Source code for nmrstarlib.bmrblex

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
nmrstarlib.bmrblex
~~~~~~~~~~~~~~~~~~

This module provides the :func:`~nmrstarlib.bmrblex.bmrblex` lexical analyzer for
the BMRB NMR-STAR format syntax. It is implemented as a Python generator-based
state machine which generates (yields) one token at a time when
:py:func:`next()` is invoked on a :func:`~nmrstarlib.bmrblex.bmrblex` generator instance.


Simplified description of parsing rules:
----------------------------------------
   * Each word or number separated by whitespace characters is a separate BMRB token.
   * Each single quoted (') string is a separate BMRB token; it should start with a single quote (')
     and end with a single quote *always* followed by whitespace character(s).
   * Each double quoted (") string is a separate BMRB token; it should start with a double quote (")
     and end with a double quote *always* followed by whitespace character(s).
   * Single quoted and double quoted strings have to be processed separately.
   * Single quoted and double quoted strings are processed one character at a time.
   * Multiline strings start with a semicolon *always* followed by a new line character and
     end with a semicolon *always* followed by whitespace character(s).
   * Multiline strings are processed one line at a time.

.. note::
   * For a full description of NMR-STAR file format, see official documentation:
     http://www.bmrb.wisc.edu/dictionary/
   * For a concise description of the NMR-STAR file format grammar see:
     https://github.com/mattfenwick/NMRPyStar#nmr-star-grammar
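
Illustrative usage (a minimal sketch; the NMR-STAR fragment below is made up
for illustration)::

   from nmrstarlib.bmrblex import bmrblex

   text = u'''loop_
      _Atom_chem_shift.ID
      _Atom_chem_shift.Val
      1 4.67
      2 8.23
   stop_
   '''

   for token in bmrblex(text):
       print(token)
   # yields the tokens "loop_", "_Atom_chem_shift.ID", "_Atom_chem_shift.Val",
   # "1", "4.67", "2", "8.23", "stop_", in that order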
"""

from collections import deque


def transform_text(input_txt):
    """Transforms text into :py:class:`~collections.deque`, pre-processes
    multiline strings.

    :param str or bytes input_txt: Input text.
    :return: Double-ended queue of single characters and multiline strings.
    :rtype: :py:class:`~collections.deque`
    """
    if isinstance(input_txt, str):
        text = u"{}".format(input_txt)
    elif isinstance(input_txt, bytes):
        text = input_txt.decode("utf-8")
    else:
        raise TypeError("Expecting <class 'str'> or <class 'bytes'>, but {} was passed".format(type(input_txt)))

    inputq = deque(text.split(u"\n"))
    outputq = deque()

    while len(inputq) > 0:
        line = inputq.popleft()

        # Collect consecutive comment lines into a single deque element.
        if line.lstrip().startswith(u"#"):
            comment = u"" + line + u"\n"
            line = inputq.popleft()

            while line.lstrip().startswith(u"#"):
                comment += line + u"\n"
                line = inputq.popleft()

            outputq.append(comment)

            for character in line:
                outputq.append(character)

        # Collapse a semicolon-delimited multiline value into a single deque element.
        elif line.startswith(u";"):
            multiline = u""
            multiline += line + u"\n"
            line = inputq.popleft()

            while not line.startswith(u";"):
                multiline += line + u"\n"
                line = inputq.popleft()

            multiline += line[:1]
            outputq.append(multiline[1:-1])  # remove STAR syntax from multiline string

            for character in line[1:]:
                outputq.append(character)

        # Ordinary line: emit it one character at a time.
        else:
            for character in line:
                outputq.append(character)

        outputq.append(u"\n")

    outputq.extend([u"\n", u""])  # end of file signal

    return outputq
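
# Note on transform_text output (illustration, derived from the code above):
# for an input such as "_Entry.Title\n;\nTwo line\ntitle\n;\n", the returned
# deque holds the characters of "_Entry.Title" one at a time, a "\n", then a
# single element "\nTwo line\ntitle\n" containing the whole multiline value,
# followed by the remaining newlines and the end-of-file signal ("\n", "").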


def bmrblex(text):
    """A lexical analyzer for the BMRB NMR-STAR format syntax.

    :param text: Input text.
    :type text: :py:class:`str` or :py:class:`bytes`
    :return: Current token.
    :rtype: :py:class:`str`
    """
    stream = transform_text(text)

    wordchars = (u"abcdfeghijklmnopqrstuvwxyz"
                 u"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
                 u"ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
                 u"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ"
                 u"!@$%^&*()_+:;?/>.<,~`|\{[}]-=")
    whitespace = u" \t\v\r\n"
    comment = u"#"

    state = u" "
    token = u""
    single_line_comment = u""

    while len(stream) > 0:
        nextnextchar = stream.popleft()

        while True:
            nextchar = nextnextchar

            if len(stream) > 0:
                nextnextchar = stream.popleft()
            else:
                nextnextchar = u""

            # Process multiline string, comment, or single line comment
            if len(nextchar) > 1:
                state = u" "
                token = nextchar
                break  # emit current token

            elif nextchar in whitespace and nextnextchar in comment and state not in (u"'", u'"'):
                single_line_comment = u""
                state = u"#"

            if state is None:
                token = u""  # past end of file
                break

            elif state == u" ":
                if not nextchar:
                    state = None
                    break
                elif nextchar in whitespace:
                    if token:
                        state = u" "
                        break  # emit current token
                    else:
                        continue
                elif nextchar in wordchars:
                    token = nextchar
                    state = u"a"
                elif nextchar == u"'" or nextchar == u'"':
                    token = nextchar
                    state = nextchar
                else:
                    token = nextchar

                    if token:
                        state = u" "
                        break  # emit current token
                    else:
                        continue

            # Process single-quoted or double-quoted token
            elif state == u"'" or state == u'"':
                token += nextchar

                if nextchar == state:
                    if nextnextchar in whitespace:
                        state = u" "
                        token = token[1:-1]  # remove single or double quotes from the ends
                        break

            # Process single line comment
            elif state == u"#":
                single_line_comment += nextchar
                if nextchar == u"\n":
                    state = u" "
                    break

            # Process regular (unquoted) token
            elif state == u"a":
                if not nextchar:
                    state = None
                    break
                elif nextchar in whitespace:
                    state = u" "
                    if token:
                        break  # emit current token
                    else:
                        continue
                else:
                    token += nextchar

        if nextnextchar:
            stream.appendleft(nextnextchar)

        yield token
        token = u""
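

if __name__ == "__main__":
    # Minimal usage sketch (illustrative addition, not part of the module API):
    # tokenize a small, made-up NMR-STAR fragment that exercises plain,
    # single-quoted, and multiline (semicolon-delimited) values.
    sample_text = (u"save_example_entry\n"
                   u"   _Entry.Title\n"
                   u";\n"
                   u"Example title spanning\n"
                   u"two lines\n"
                   u";\n"
                   u"   _Entry.Type  'example value'\n"
                   u"save_\n")

    for token in bmrblex(sample_text):
        if token:  # skip the empty end-of-input token, if any
            print(repr(token))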