Notes on building lexers

The following notes detail my preferred strategy for building UTF-8 lexers in C++. This strategy uses little specific to C++, and could be easily adapted to C with minimal effort.

If you are unfamiliar with Unicode or the idea of a DFA and how it can be used to implement regular languages, you may want to read about them before proceeding.

A lexer class will have the following instance variables:

The class should also have the following methods. Methods beginning with underscores are protected.

The following psuedocode provides a general outline.

struct Lexer:
  void Reset()
    ...
  void SetBuffer(const char *buf, unsigned buf_len)
    ...
  int GetNextToken(tok)
    while !_IsEnd() || reissue
      if _reissue: _reissue := false
      else
        _Advance()         (return if fail)
      _StateDispatch(tok)  (return if != 0)

    return 'EOF Error'

  int GetFinalToken(tok)
    if done: return 'EOF Error'
    done := true
    c := 0
    error_code := _StateDispatch(tok)   (return if error)
    if state ≠ 0: return 'Unexpected Error'
    return 0 if error_code = 1 [Token Yielded], 'EOF Error' otherwise

  bool _IsEnd(): ...
  bool _Reissue(): reissue := true

  int _Advance()
    if _IsEnd(): return 'EOF Error'
    if bytemode
      _c = *buf_next++
    else
      var unsigned v
      var int nr
      if midchar
        extra := 6 - residual_len
        memcpy(residual + residual_len, buf_next, extra)
        num_bytes_read := utf8_to_utf32(residual, 6, out v)
          (return 'Invalid Argument Error' if fail)

        nr := nr - residual_len
        midchar := false
      else
        nr := utf8_to_utf32(buf_next, buf_end-buf_next, out v)
        if nr = 'EOF Error'
          midchar := true
          residual_len := buf_end - buf_next
          memcpy(residual, buf_next, residual_len)
          buf_next := buf_end
          return 'EOF Error'
        else if nr < 0
          return nr

      buf_next := buf_next + nr
      c := v

      if c = '\n'
        linenum := linenum + 1
        colnum := 1
      else
        colnum := colnum + 1

      return 0

  int _StateDispatch(tok)
    switch state
      case 0: return _SDefault(tok)
      case ...: return ...(tok)
      ...

      default: UNREACHABLE (assert)

  bool done, reissue, midchar
  unsigned state, linenum, column, c, residual_len
  char residual[6]
  const char *buf_next, *buf_end