# This is an example SSS file formed by taking a bit of the Java source code of the SSS reference implementation and SSS-izing it. It serves as an extended example of the syntax, as an example of what an SSS programming language might look like, and (for those that know Java) as an illustration of the difference between keywords, constants and identifiers.
# The original Java file was an early version of the code, and I now know that it contained some bugs. I have not bothered to fix this version. You should therefore not use this as an example of how to lex an SSS file!
# Games you can play:
#  - Place a " character at the beginning of the file. That makes a literal string which ends just a moment ago.
#  - Place a " character just after this comment. That makes a literal string that ends at '"' (which used to be a character constant) at about line 30.

PACKAGE "org.sc3d.apt.sss.v3";

IMPORT "java.io";

# Represents a lexical analysis of a Sentence. The analysis recognises comments, keywords (including punctuation strings and separator characters), constants, identifiers, numbers and strings, which it wraps up as Tokens with appropriate types. All white-space is discarded.
PUBLIC CLASS Lex {
  # Constructs a Lex representing the lexical structure of 'sentence'. Each character at which lexing resumes is classified using 'Cc' and handed to the matching 'LexXxx' method. The Tokens that come back are collected and stored in 'Tokens'. White-space is skipped. The first character that belongs to no class is reported to 'sentence' as an error, and later ones are skipped silently.
  # @param sentence the Sentence to analyse.
  PUBLIC Lex(Sentence sentence) {
    THIS.Sentence = sentence;
    FINAL TokenBuffer tb = NEW TokenBuffer();
    BOOLEAN suppress = False; # Prevents multiple 'Illegal character' messages.
    FOR (INT i=0; i<sentence.Length; ) {
      Token t = Null;
      FINAL CHAR c = sentence.Get(i);
      # Characters with no entry in 'Cc' are given class '0', which only the DEFAULT case handles.
      SWITCH (c<Cc.Length ? Cc[c] : 0) {
        CASE CWhiteSpace { BREAK; }
        CASE CLone { t = NEW Token(Token.TypeWord, sentence, i, 1); BREAK; }
        CASE CPunctuation { t = THIS.LexPunctuation(i); BREAK; }
        CASE CLetter { t = THIS.LexAlphanumeric(i); BREAK; }
        CASE CDigit { t = SSSNumber.Lex(sentence, i); BREAK; }
        CASE CSpecial {
          SWITCH(c) {
            CASE '#' { t = THIS.LexComment(i); BREAK; }
            CASE '"' { t = THIS.LexString(i); BREAK; }
            CASE ''' { t = THIS.LexChar(i); BREAK; }
            DEFAULT {
              THROW NEW RuntimeException("Don't know what to do with "+c);
            }
          }
          # The BREAKs above only leave the inner SWITCH. This one leaves the outer SWITCH, so that control does not run on into the DEFAULT case and report a perfectly good comment, string or character literal as an illegal character.
          BREAK;
        }
        DEFAULT {
          IF (!suppress) sentence.AddError(
            "This is an illegal character, which must not appear except in "+
            "comments and strings",
            i, 1
          );
          suppress = True;
          BREAK;
        }
      }
      # Step over the whole Token if there was one, otherwise over the single character just examined.
      IF (t!=Null) {
        tb.Append(t);
        i += t.Length;
      } ELSE {
        i++;
      }
    }
    THIS.Tokens = tb.ToArray();
    THIS.NumTokens = THIS.Tokens.Length;
  }
  
  # New API.
  
  # The Sentence of which THIS is the lexical analysis. Assigned once, by the constructor.
  PUBLIC FINAL Sentence Sentence;
  
  # The number of lexical Tokens found in 'Sentence'. The constructor sets it to the length of the Token array it builds.
  PUBLIC FINAL INT NumTokens;
  
  # Looks up a single lexical Token by its position.
  # @param index the position of the wanted Token, counting from '0' up to 'NumTokens-1'.
  PUBLIC Token GetToken(INT index) {
    RETURN THIS.Tokens[index];
  }
  
  # Private.
  
  # Reads a maximal string of SSS punctuation characters starting at 'start' and returns a Token of type 'TypeWord'. The string ends at the first character that is not of class 'CPunctuation', or at the end of 'Sentence'.
  # @param start the index in 'Sentence' of the first character to examine.
  PRIVATE Token LexPunctuation(FINAL INT start) {
    INT i = start;
    WHILE (i<THIS.Sentence.Length) {
      FINAL CHAR c = THIS.Sentence.Get(i);
      # The range test comes first and short-circuits, so that 'Cc' is never indexed with a character it has no entry for. The last valid index is 'Cc.Length-1'.
      IF (c>=Cc.Length || Cc[c]!=CPunctuation) BREAK;
      i++;
    }
    RETURN NEW Token(Token.TypeWord, THIS.Sentence, start, i-start);
  }
  
  # Reads a maximal string of alphanumeric characters starting at 'start' and returns a Token of type 'TypeWord', 'TypeConstant' or 'TypeIdentifier' as appropriate. A string of two or more characters, all of them capital letters, is a 'TypeWord'. Any other string is a 'TypeConstant' if it starts with a capital letter and a 'TypeIdentifier' if it does not.
  # @param start the index in 'Sentence' of the first character. The first character is always accepted without being checked. An IllegalArgumentException is thrown if 'start' is not less than 'Sentence.Length'.
  PRIVATE Token LexAlphanumeric(FINAL INT start) {
    INT i = start;
    IF (i>=THIS.Sentence.Length) THROW NEW IllegalArgumentException();
    CHAR c = THIS.Sentence.Get(i++);
    BOOLEAN isInitialCapital = c>='A' && c<='Z';
    BOOLEAN isAllCapital = isInitialCapital;
    WHILE (i<THIS.Sentence.Length) {
      c = THIS.Sentence.Get(i);
      # Small letters and digits carry on the string but mean that it is not a keyword.
      IF ((c>='a' && c<='z') || (c>='0' && c<='9')) isAllCapital = False;
      # Anything else carries on the string only if it is a capital letter. The test is inclusive of both 'A' and 'Z'.
      ELSE IF (c<'A' || c>'Z') BREAK;
      i++;
    }
    FINAL INT length = i-start;
    RETURN NEW Token(
      isAllCapital && length>1 ? Token.TypeWord :
      isInitialCapital ? Token.TypeConstant : Token.TypeIdentifier,
      THIS.Sentence, start, length
    );
  }
  
  # Lexes an SSS comment starting at 'start', and RETURNs a Token with type 'TypeComment'. The comment stops just after the first hash or newline character at which the round brackets and braces seen so far inside the comment are balanced, or else at the end of 'Sentence'. If the end of 'Sentence' is reached with them unbalanced, an error naming a missing character is attached to the Token.
  # @param start the index in 'Sentence' of the hash character that opens the comment. An IllegalArgumentException is thrown if there is no hash character there.
  PRIVATE Token LexComment(FINAL INT start) {
    IF (start>=THIS.Sentence.Length || THIS.Sentence.Get(start)!='#') {
      THROW NEW IllegalArgumentException("Comments must start with #");
    }
    INT pos = start+1;
    INT roundDepth = 0;
    INT curlyDepth = 0;
    WHILE (pos<THIS.Sentence.Length) {
      FINAL CHAR c = THIS.Sentence.Get(pos);
      pos++;
      SWITCH (c) {
        CASE '(' { roundDepth++; BREAK; }
        CASE ')' { roundDepth--; BREAK; }
        CASE '{' { curlyDepth++; BREAK; }
        CASE '}' { curlyDepth--; BREAK; }
        CASE '#' CASE '\A/' {
          # A terminator only counts when nothing is left open. 'pos' is already past it, so it is included in the Token.
          IF (roundDepth==0 && curlyDepth==0) {
            RETURN NEW Token(Token.TypeComment, THIS.Sentence, start, pos-start);
          }
          BREAK;
        }
        DEFAULT { BREAK; }
      }
    }
    # The end of 'Sentence' was reached without finding a terminator that counts.
    FINAL Token ans = NEW Token(
      Token.TypeComment, THIS.Sentence, start, pos-start
    );
    IF (roundDepth!=0 || curlyDepth!=0) {
      # Round brackets are checked before braces. A negative depth means that an opening character is the one that is missing.
      CHAR missing;
      IF (roundDepth<0) missing = '(';
      ELSE IF (roundDepth>0) missing = ')';
      ELSE IF (curlyDepth<0) missing = '{';
      ELSE missing = '}';
      ans.AddError(
        "This comment does not end. There's a '"+
        missing+
        "' character missing somewhere."
      );
    }
    RETURN ans;
  }

  # Reads an SSS string literal from 'Sentence' starting at 'start' and returns a Token with type 'TypeString'. The Token runs from the opening quote to the closing quote inclusive. A malformed escape sequence is reported to 'Sentence' and lexing of the string carries on. If 'Sentence' ends before the closing quote, the Token runs to the end of 'Sentence' and has an error attached.
  # @param start the index in 'Sentence' of the opening quote. An IllegalArgumentException is thrown if there is no quote there.
  PRIVATE Token LexString(FINAL INT start) {
    INT i = start;
    IF (i>=THIS.Sentence.Length || THIS.Sentence.Get(i++)!='"') {
      THROW NEW IllegalArgumentException("Strings must start with \22/");
    }
    WHILE (i<THIS.Sentence.Length) {
      FINAL CHAR c = THIS.Sentence.Get(i);
      IF (c=='"') {
        # Step over the closing quote so that it is part of this Token. Otherwise the caller would resume at the quote and lex it as the start of another string.
        i++;
        RETURN NEW Token(Token.TypeString, THIS.Sentence, start, i-start);
      }
      IF (c=='\5C/') {
        FINAL INT l = THIS.LexEscape(i);
        IF (l>0) i += l;
        ELSE {
          # Report the backslash and then treat it as an ordinary character.
          THIS.Sentence.AddError("Malformed escape sequence.", i, 1);
          i++;
        }
      } ELSE {
        i++;
      }
    }
    FINAL Token ans = (
      NEW Token(Token.TypeString, THIS.Sentence, start, i-start)
    );
    ans.AddError(
      "This string does not end. There's a \22/ character missing somewhere."
    );
    RETURN ans;
  }
  
  # Reads an SSS character literal from 'Sentence' starting at 'start' and returns a Token of type 'TypeChar'. The literal is an opening quote, then either a single character or an escape sequence, then a closing quote. A malformed escape sequence or a missing closing quote is reported to 'Sentence', and a Token is returned all the same.
  # @param start the index in 'Sentence' of the opening quote. An IllegalArgumentException is thrown if there is no quote there.
  PRIVATE Token LexChar(FINAL INT start) {
    INT i = start;
    IF (i>=THIS.Sentence.Length || THIS.Sentence.Get(i++)!=''') {
      THROW NEW IllegalArgumentException("characters must start with '");
    }
    # If 'Sentence' ends straight after the opening quote there is nothing to consume. 'i' is left alone so that it never passes 'Sentence.Length', and the missing quote is reported below.
    IF (i<THIS.Sentence.Length) {
      IF (THIS.Sentence.Get(i)!='\5C/') i++;
      ELSE {
        FINAL INT l = THIS.LexEscape(i);
        IF (l>0) i += l;
        ELSE {
          # Report the backslash and then treat it as an ordinary character.
          THIS.Sentence.AddError("Malformed escape sequence.", i, 1);
          i++;
        }
      }
    }
    IF (i<THIS.Sentence.Length && THIS.Sentence.Get(i)==''') i++;
    ELSE THIS.Sentence.AddError("There's a ' character missing here.", i, 1);
    RETURN NEW Token(Token.TypeChar, THIS.Sentence, start, i-start);
  }
  
  # Reads an SSS character escape from 'Sentence' starting at 'start', and returns its length. If this method does not find a syntactically correct escape sequence, it returns '0'. A syntactically correct escape sequence consists of zero, one, two, three or four hexadecimal digits enclosed in '\' characters. Only the digits '0' to '9' and the capital letters 'A' to 'F' are accepted as hexadecimal digits.
  # NOTE - review: the escapes written in the string and character literals of this file end with a slash, but this method looks for a closing backslash. Confirm which of the two is intended.
  # @param start the index in 'Sentence' of the opening backslash.
  PRIVATE INT LexEscape(INT start) {
    IF (start>=THIS.Sentence.Length) RETURN 0;
    IF (THIS.Sentence.Get(start)!='\5C/') RETURN 0;
    # With at most four digits, the closing character is at an offset of no more than five.
    FOR (INT i=1; i<=5; i++) {
      IF (start+i>=THIS.Sentence.Length) RETURN 0;
      FINAL CHAR c = THIS.Sentence.Get(start+i);
      IF (c=='\5C/') RETURN i+1;
      # Both ranges are inclusive, so that '9' and 'F' count as digits.
      IF ((c<'0' || c>'9') && (c<'A' || c>'F')) RETURN 0;
    }
    RETURN 0;
  }
  
  # An array of length 'NumTokens', containing the lexical Tokens. It is assigned once, by the constructor, and read by 'GetToken'. Both refer to it by the name 'Tokens'.
  PRIVATE FINAL Token[] Tokens;
  
  # The value stored in 'Cc' for white-space characters. The constructor skips these without making a Token.
  PRIVATE STATIC FINAL INT CWhiteSpace = 1;
  
  # The value stored in 'Cc' for characters that form a Token all on their own. The constructor makes a 'TypeWord' Token of length one for each.
  PRIVATE STATIC FINAL INT CLone = 2;
  
  # The value stored in 'Cc' for SSS punctuation characters. A run of these is lexed by 'LexPunctuation'.
  PRIVATE STATIC FINAL INT CPunctuation = 3;
  
  # The value stored in 'Cc' for letters. A Token that starts with one is lexed by 'LexAlphanumeric'.
  PRIVATE STATIC FINAL INT CLetter = 4;
  
  # The value stored in 'Cc' for decimal digits. A Token that starts with one is lexed by 'SSSNumber.Lex'.
  PRIVATE STATIC FINAL INT CDigit = 5;
  
  # The value stored in 'Cc' for special characters that introduce things: comments, strings and character literals.
  PRIVATE STATIC FINAL INT CSpecial = 6;
  
  # The classification table for ASCII characters, indexed by character code. Each entry set by 'Classify' holds one of the 'CXxx' class values. Entries that are never set keep the initial value of the array, presumably '0' as in Java, which the constructor treats as an illegal character.
  PRIVATE STATIC FINAL INT[] Cc = NEW INT[127];
  STATIC {
    # The classes are filled in in the order of their 'CXxx' values. No character appears in more than one of the strings, so the order of the calls does not affect the result.
    Classify(CWhiteSpace, " \9/\A/\D/");
    Classify(CLone, ",.;(){}");
    Classify(CPunctuation, "!$%&*+-/:<=>?@[\5C/]^_`|");
    Classify(CLetter, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    Classify(CDigit, "0123456789");
    Classify(CSpecial, "#\22/'");
  }
  
  # Records in 'Cc' that every character of 's' belongs to the class 'type'.
  # @param type one of the 'CXxx' class values.
  # @param s the characters to classify. Each is used as an index into 'Cc', so each must have a code less than 'Cc.Length'.
  PRIVATE STATIC VOID Classify(INT type, String s) {
    FOR (INT i=0; i<s.Length(); i++) Cc[s.CharAt(i)] = type;
  }
  
  # Test code.
  
  # Takes one argument: a filename. Reads the file and lexes it. If there are errors, prints an error report. Otherwise lists the Tokens, one per line, on 'System.Out'.
  # @param args the command-line arguments. An IllegalArgumentException is thrown unless there is exactly one.
  PUBLIC STATIC VOID Main(String[] args) THROWS IOException {
    IF (args.Length!=1) THROW NEW IllegalArgumentException(
      "Syntax: java org.sc3d.apt.sss.v3.Lex <filename>"
    );
    FINAL Sentence sentence = Sentence.ReadFile(args[0]);
    FINAL Lex me = NEW Lex(sentence);
    # The Tokens are only listed if lexing was clean. The second argument of 'PrintErrorReport' is passed as '100'. It is presumably a limit on the size of the report, but that is not visible from here.
    IF (sentence.CountErrors()>0) {
      sentence.PrintErrorReport(System.Out, 100);
      RETURN;
    }
    FOR (INT i=0; i<me.NumTokens; i++) {
      System.Out.Println("Tokens["+i+"] = "+me.GetToken(i));
    }
  }
}