/******************************************************************************
* Module    :   Regular Expression Compiling.
*
* Author    :   John W. M. Stevens.
*
* Notes     :   Use UNIX style regular expressions.  Grammar is:
*
*   REG_EXPR    ::= ANCHOR
*                   | ANCHOR '|' REG_EXPR
*
*   ANCHOR      ::= CATENATION
*                   | '^' CATENATION
*                   | CATENATION '$'
*                   | '^' CATENATION '$'
*
*   CATENATION  ::= PHRASE
*                   | PHRASE CATENATION
*
*   PHRASE      ::= UNARY
*                   | UNARY ENUM_OP
*
*   ENUM_OP     ::= '*'
*                   | '+'
*                   | '?'
*                   | '{' SPAN_RNG '}'
*
*   SPAN_RNG    ::= NUMBER
*                   | NUMBER ',' NUMBER
*
*   UNARY       ::= '(' REG_EXPR ')'
*                   | CHAR_STR
*                   | '[' SET ']'
*                   | '[' '^' SET ']'
*                   | '.'
*
*   SET         ::= CHAR_STR
*                   | CHAR_STR SET
*                   | CHAR '-' CHAR
*                   | CHAR '-' CHAR SET
*
*   CHAR_STR    ::= CHARACTER
*                   | CHARACTER CHARACTER_STR
*
*   CHARACTER   ::= ' ' - '~' except for []()|{}+*? which are special without
*                   being escaped.
******************************************************************************/

#include    "compiler.h"

#include    "unpost.h"
#include    "sets.h"
#include    "regexp.h"
#include    "utils.h"

/*  Character Sets. */
static  int     ReInitFlag = 0;
static  char    *SpecStr = "[().|$";
static  SET     SpecSet;
static  char    *PostFixStr = "{*?+";
static  SET     PostSet;
static  char    *DecStr = "0-9";
static  SET     DecSet;

static  UINT    SubExprNo;

static  REG_EXP_NODE    *Alternation(char **);

#if defined( RE_TEST )

/*-----------------------------------------------------------------------------
| Routine   :   PrtReExpr() --- Print a regular expression for debugging
|               purposes.  (This may be used in the configuration helper
|               in a later release, so make it pretty).
|
| Inputs    :   Node    - Pointer to current level regular expression node.
|               Level   - Current level in tree.
-----------------------------------------------------------------------------*/

static
void    PrtReExpr(REG_EXP_NODE  *Node,
                  int           Level)
{
    register    int     i;
    register    int     j;

    /*  Traverse right to left so that the tree can be printed
    *   on a screen or page rotated 90 degrees to the left.
    *
    *   (Right to left == top to bottom of screen or page)
    */
    if ( Node->Right )
        PrtReExpr(Node->Right, Level + 1);

    /*  Print indentation.  */
    for (i = 0; i < Level; i++)
        printf("    ");

    /*  Print node type and information.    */
    switch ( Node->NodeType )
    {
    case OP_L_PAREN:
        printf("(\n");
        break;
    case DATA_LEFT_ANCHOR:
        printf("^\n");
        break;
    case DATA_RIGHT_ANCHOR:
        printf("$\n");
        break;
    case OP_ENUM:
        printf("Enum {%d, %u}\n",
                Node->MinSpan,
                Node->MaxSpan);
        break;
    case OP_OR:
        printf("|\n");
        break;
    case OP_AND:
        printf("&\n");
        break;
    case DATA_ANY:
        printf(".\n");
        break;
    case DATA_SPAN:
        printf("Span {%d, %u}\n",
                Node->MinSpan,
                Node->MaxSpan);
        break;
    case DATA_STRING:
        printf("'%s'\n",
                Node->data.MatchStr,
                Node->SubExprNo);
        break;
    case DATA_SET:
        printf("[");
        for (j = 0; j < 128; j++)
            if ( InSet(Node->data.CSet, (unsigned char) j) )
            {
                if (j < 32 || j > 127)
                    printf("\\x%02x");
                else
                    putchar( (char) j );
            }
        printf("]  %d\n", Node->SubExprNo);
        break;
    case NODE_TYPE_NOT_SET:
    default:
        printf(">>>> ERROR !, no node type set.\n");
        break;
    }

    /*  Now print left child.   */
    if ( Node->Left )
        PrtReExpr(Node->Left, Level + 1);
}

#endif

/*-----------------------------------------------------------------------------
| Routine   :   AllocRegExpNode() --- Allocate a regular expression node.
|
| Returns   :   Returns a pointer to the newly allocated regular expression
|               node.
-----------------------------------------------------------------------------*/

static
REG_EXP_NODE    *AllocRegExpNode(void)
{
    auto        REG_EXP_NODE    *New;
    extern      FILE            *ErrFile;

    /*  Allocate the node.  */
    if ((New = (REG_EXP_NODE *) calloc(1, sizeof( REG_EXP_NODE ))) == NULL)
    {
        fprintf(ErrFile,
                "%s %d : Out of memory.\n",
                __FILE__,
                __LINE__);
        exit( 1 );
    }
    return( New );
}

/*-----------------------------------------------------------------------------
| Routine   :   GetInt() --- Get an integer number for the case of
|               regular expression enumeration.
|
| Inputs    :   Str     - Pointer to string to get number from.
| Outputs   :   Str     - Pointer to first character in string after number.
|
| Returns   :   Returns the integer read from the string.
-----------------------------------------------------------------------------*/

static
int     GetInt(char **Str)
{
    auto        int     i;

    /*  Strip white space.  */
    while (**Str == ' ' || **Str == '\t' || **Str == '\n')
        (*Str)++;

    /*  Get number. */
    i = 0;
    while ( InSet(DecSet, (unsigned char) **Str) )
        i = i * 10 + (*(*Str)++ - '0');

    /*  Strip trailing white space. */
    while (**Str == ' ' || **Str == '\t' || **Str == '\n')
        (*Str)++;

    /*  Return number.  */
    return( i );
}

/*-----------------------------------------------------------------------------
| Routine   :   EnumOp() --- Get the enumeration modifer.
|               node.
|
| Inputs    :   Str     - Pointer to enumeration operator.
|               Root    - Pointer to regular expression node.
| Outputs   :   Str     - Pointer after enumeration operator.
|
| Note      :   A value of zero for maximum span value indicates ANY
|               number.
-----------------------------------------------------------------------------*/

static
void    EnumOp(char         **Str,
               REG_EXP_NODE *Root)
{
    extern      FILE    *ErrFile;

    /*  Set up the operator and enumerator values.  */
    switch ( *(*Str)++ )
    {
    case '+':
        Root->MinSpan = 1;
        Root->MaxSpan = ~0;
        break;
    case '*':
        Root->MinSpan = 0;
        Root->MaxSpan = ~0;
        break;
    case '?':
        Root->MinSpan = 0;
        Root->MaxSpan = 1;
        break;
    case '{':
        /*  Get specifically enumerated span.   */
        Root->MinSpan = GetInt( Str );
        Root->MaxSpan = Root->MinSpan;

        /*  Is there a maximum number of characters?    */
        if (**Str == ',')
        {
            /*  Get maximum span.   */
            (*Str)++;

            /*  Strip white space.  */
            while (**Str == ' ' || **Str == '\t' || **Str == '\n')
                (*Str)++;

            /*  Set to maximum, or get maximum. */
            if (**Str == '}')
                Root->MaxSpan = ~0;
            else if ( InSet(DecSet, (unsigned char) **Str) )
                Root->MaxSpan = GetInt( Str );
        }

        /*  Check for end brace.    */
        if (**Str != '}')
        {
            fprintf(ErrFile,
                    "%s %d : Error - missing '}' in regular expression.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }
        (*Str)++;
        break;
    }
}

/*-----------------------------------------------------------------------------
| Routine   :   Unary() --- Unary Operations.
|
| Inputs    :   Str - Pointer to current character in source RE string.
|
| Returns   :   Pointer to regular expression node.
-----------------------------------------------------------------------------*/

static
REG_EXP_NODE    *Unary(char **Str)
{
    auto        REG_EXP_NODE    *Node;
    auto        char            Buffer[256];
    auto        char            *Tmp;
    extern      FILE            *ErrFile;

    /*  Get the regular expression atoms.   */
    switch ( **Str )
    {
    case '.':
        /*  Allocate a regular expression node. */
        (*Str)++;
        Node = AllocRegExpNode();
        Node->NodeType = DATA_ANY;
        break;
    case '[':
        /*  Allocate a regular expression node. */
        Node = AllocRegExpNode();

        /*  Allocate a set. */
        if ((Node->data.CSet = (SET_TYPE *) calloc(1, SET_SIZE *
        sizeof( SET_TYPE ))) == NULL)
        {
            fprintf(ErrFile,
                    "%s %d : Out of memory.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }

        /*  Create the set. */
        (*Str)++;
        CrtSet((unsigned char **) Str, Node->data.CSet);
        Node->NodeType = DATA_SET;
        break;
    case '(':
        /*  Skip parentheses. */
        (*Str)++;

        /*  Allocate a regular expression node. */
        Node = AllocRegExpNode();
        Node->NodeType = OP_L_PAREN;

        /*  Save the sub expression number. */
        if (SubExprNo > MAX_SUB_EXPRS)
        {
            fprintf(ErrFile,
                    "%s %d : Error - To many sub-expressions.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }
        Node->SubExprNo = SubExprNo++;

        /*  Get sub expression. */
        Node->Right = Alternation( Str );

        /*  Check for end parentheses.  */
        if (**Str != ')')
        {
            fprintf(ErrFile,
                    "%s %d : Error - missing ')' in regular expression.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }
        (*Str)++;
        break;
    default:
        /*  Check for badly formed regular expression.  */
        if (InSet(SpecSet, (unsigned char) **Str) ||
            InSet(PostSet, (unsigned char) **Str))
        {
            fprintf(ErrFile,
                    "%s %d : Error - badly formed regular expression.\n",
                    __FILE__,
                    __LINE__);
            fprintf(ErrFile,
                    "\tUnexpected character '%c'\n",
                    **Str);
            exit( 1 );
        }

        /*  Allocate a regular expression node. */
        Node = AllocRegExpNode();

        /*  Get characters while they are not in the set of special
        *   characters.
        */
        for (Tmp = Buffer; **Str; )
            if (**Str == '\\' && (*Str)[1])
            {
                *Tmp++ = *++*Str;
                ++*Str;
            }
            else if (InSet(SpecSet, (unsigned char) **Str)||
                     InSet(PostSet, (unsigned char) **Str))
                break;
            else
                *Tmp++ = *(*Str)++;
        *Tmp = '\0';
        Node->NodeType = DATA_STRING;

        /*  Duplicate the string and add to the node.   */
        if ((Node->data.MatchStr = StrDup( Buffer )) == NULL)
        {
            fprintf(ErrFile,
                    "%s %d : Out of memory.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }
        break;
    }

    /*  Return a pointer to the new node.   */
    return( Node );
}

/*-----------------------------------------------------------------------------
| Routine   :   Enumerate() --- Enumerate a regular expression.
|
| Inputs    :   Str - Pointer to current character in source RE string.
|
| Returns   :   Pointer to regular expression node.
-----------------------------------------------------------------------------*/

static
REG_EXP_NODE    *Enumerate(char **Str)
{
    auto    REG_EXP_NODE    *Node;
    auto    REG_EXP_NODE    *New;
    extern  FILE            *ErrFile;

    /*  Get the regular expression. */
    Node = Unary( Str );

    /*  Test for enumeration.   */
    if ( InSet(PostSet, (unsigned char) **Str) )
    {
        /*  Check for the special case of enumerating a '.' */
        if (Node->NodeType == DATA_ANY)
        {
            /*  Modify it to be a DATA_SPAN type.   */
            Node->NodeType = DATA_SPAN;
        }
        else if (Node->NodeType == OP_L_PAREN)
        {
            fprintf(ErrFile,
                    "%s %d : Error, can not enumerate a sub expression.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }
        else
        {
            /*  Allocate an enumeration node.   */
            New = AllocRegExpNode();
            New->Right = Node;
            New->NodeType = OP_ENUM;
            Node = New;
        }

        /*  Determine enumeration value.    */
        EnumOp(Str, Node);
    }
    return( Node );
}

/*-----------------------------------------------------------------------------
| Routine   :   Catenation() --- Concatenate regular expressions.
|
| Inputs    :   Str - Pointer to current character in source RE string.
|
| Returns   :   Pointer to regular expression node.
-----------------------------------------------------------------------------*/

static
REG_EXP_NODE    *Catenation(char    **Str)
{
    auto        REG_EXP_NODE    *Root;
    auto        REG_EXP_NODE    *SubExpr;
    auto        REG_EXP_NODE    *Prev;
    auto        REG_EXP_NODE    *Next;

    /*  Loop, getting concatenations (and operation) of regular
    *   expressions seperated by OR's.
    */
    Prev = Root = NULL;
    for ( ; ; )
    {
        /*  Get next sub expression.    */
        SubExpr = Enumerate( Str );

        /*  Determine type of action to take.   */
        if (**Str && **Str != '|' && **Str != ')' && **Str != '$')
        {
            /*  Create new node.    */
            Next = AllocRegExpNode();
            Next->Left = SubExpr;
            Next->NodeType = OP_AND;

            /*  Link to old node.   */
            if ( Prev )
                Prev->Right = Next;
            else
                Root = Next;
            Prev = Next;
        }
        else
        {
            /*  Determine final link type.  */
            if ( Prev )
                Prev->Right = SubExpr;
            else
                Root = SubExpr;

            /*  End loop.   */
            break;
        }
    }

    /*  Return a pointer to the root node.  */
    return( Root );
}

/*-----------------------------------------------------------------------------
| Routine   :   Anchor() --- Parse the two anchoring unary operators.
|
| Inputs    :   Str     - Regular expression source string.
|
| Returns   :   Returns a pointer to the compiled regular expression.
-----------------------------------------------------------------------------*/

static
REG_EXP_NODE    *Anchor(char    **Str)
{
    auto        REG_EXP_NODE    *Root;
    auto        REG_EXP_NODE    *Node;

    /*  Check for begining of line anchor.  */
    if (**Str == '^')
    {
        /*  Next character. */
        (*Str)++;

        /*  Allocate a node.    */
        Root = AllocRegExpNode();
        Root->NodeType = DATA_LEFT_ANCHOR;

        /*  Get expression. */
        Root->Right = Catenation( Str );
    }
    else
        Root = Catenation( Str );

    /*  Check for end of line anchor.   */
    if (**Str == '$')
    {
        /*  Next character. */
        (*Str)++;

        /*  Allocate a node.    */
        Node = AllocRegExpNode();
        Node->NodeType = DATA_RIGHT_ANCHOR;
        Node->Right = Root;
        Root = Node;
    }

    /*  Return regular expression.  */
    return( Root );
}

/*-----------------------------------------------------------------------------
| Routine   :   Alternation() --- Parse an alternation expression.
|
| Inputs    :   Str - Pointer to current character in source RE string.
|
| Returns   :   Pointer to regular expression node.
-----------------------------------------------------------------------------*/

static
REG_EXP_NODE    *Alternation(char   **Str)
{
    auto        REG_EXP_NODE    *Root;
    auto        REG_EXP_NODE    *New;

    /*  Get a concatenation of regular expressions. */
    Root = Anchor( Str );

    /*  Loop, getting concatenations (and operation) of regular
    *   expressions seperated by OR's.
    */
    while (**Str == '|')
    {
        /*  Next character. */
        (*Str)++;

        /*  Allocate a node.    */
        New = AllocRegExpNode();
        New->Left = Root;
        New->NodeType = OP_OR;

        /*  Get right hand of expression.   */
        New->Right = Anchor( Str );
        Root = New;
    }

    /*  Return root of regular expression tree. */
    return( Root );
}

/*-----------------------------------------------------------------------------
| Routine   :   ReGraph() --- Convert a regular expression tree into a graph.
|
| Inputs    :   Node    - Regular expression tree node.
|
| Returns   :   Returns a pointer to the compiled regular expression.
-----------------------------------------------------------------------------*/

static
REG_EXP_NODE    *ReGraph(REG_EXP_NODE   *Node,
                         REG_EXP_NODE   *Link)
{
    auto        REG_EXP_NODE    *RetLink;
    auto        REG_EXP_NODE    *New;

    /*  Determine operation type.   */
    switch ( Node->NodeType )
    {
    case OP_L_PAREN:
        /*  Allocate an end of parentheses node.    */
        New = AllocRegExpNode();
        New->NodeType = OP_R_PAREN;
        New->SubExprNo = Node->SubExprNo;
        New->Right = Link;

        /*  Continue link.  */
        Node->Right = ReGraph(Node->Right, New);
        RetLink = Node;
        break;
    case OP_ENUM:
        /*  Continue link.  */
        Node->Left = Node->Right;
        Node->Right = Link;
        RetLink = Node;
        break;
    case OP_AND:
        /*  Traverse right, returning a pointer that can be used to
        *   link the tree into a graph.
        */
        RetLink = ReGraph(Node->Right, Link);

        /*  Go down left and link together. */
        RetLink = ReGraph(Node->Left, RetLink);

        /*  Free AND node and return the link.  */
        free( Node );
        break;
    case OP_OR:
        /*  Allocate an end of or node. */
        New = AllocRegExpNode();
        New->NodeType = END_OR;
        New->Right = Link;

        /*  Process both.   */
        Node->Right = ReGraph(Node->Right, New);
        Node->Left = ReGraph(Node->Left, New);

        /*  Return pointer to OR.   */
        RetLink = Node;
        break;
    case DATA_LEFT_ANCHOR:
        Node->Right = ReGraph(Node->Right, Link);
        RetLink = Node;
        break;
    case DATA_RIGHT_ANCHOR:
        /*  Allocate an end of parentheses node.    */
        New = Node->Right;
        Node->Right = Link;
        RetLink = ReGraph(New, Node);
        break;
    case DATA_ANY:
    case DATA_SPAN:
    case DATA_STRING:
    case DATA_SET:
        Node->Right = Link;
        RetLink = Node;
    }

    /*  Return a pointer to the tail end of the graph so far.   */
    return( RetLink );
}

/*-----------------------------------------------------------------------------
| Routine   :   ReCompile() --- Compile a regular expression.
|
| Inputs    :   Str     - Regular expression source string.
|
| Returns   :   Returns a pointer to the compiled regular expression.
-----------------------------------------------------------------------------*/

REG_EXP_NODE    *ReCompile(char *Str)
{
    auto        REG_EXP_NODE    *Root;

    /*  Construct sets, if not already constructed. */
    if (ReInitFlag == 0)
    {
        auto    unsigned char   **Str;

        /*  Create the sets.    */
        Str = (unsigned char **) &SpecStr;
        CrtSet(Str, SpecSet);
        Str = (unsigned char **) &PostFixStr;
        CrtSet(Str, PostSet);
        Str = (unsigned char **) &DecStr;
        CrtSet(Str, DecSet);

        /*  We have initialized, so set flag saying so. */
        ReInitFlag = 1;
    }

    /*  Check for leading anchor.   */
    SubExprNo = 1;
    Root = Alternation( &Str );

     /*  Print the regular expression tree.  */
#if defined( RE_TEST )
    PrtReExpr(Root, 0);
#endif

    /*  Convert to a graph. */
    Root = ReGraph(Root, NULL);

    /*  Return pointer to regular expression.   */
    return( Root );
}

/*-----------------------------------------------------------------------------
| Routine   :   FreeReExpr() --- Free the memory of a regular expression
|               graph memory.
|
| Inputs    :   ReExpr  - Regular expression digraph root.
-----------------------------------------------------------------------------*/

REG_EXP_NODE    *FreeReExpr(REG_EXP_NODE    *ReExpr)
{
    auto    REG_EXP_NODE    *Node;
    auto    REG_EXP_NODE    *EndOr;
    extern  FILE            *ErrFile;

    /*  Free the different node types.  */
    while (ReExpr && ReExpr->NodeType != END_OR)
    {
        /*  Get pointer to next node.   */
        Node = ReExpr->Right;

        /*  Select operation on node type.  */
        switch ( ReExpr->NodeType )
        {
        case OP_ENUM:
            (void) FreeReExpr( ReExpr->Left );
            break;
        case OP_OR:
            /*  Free to end of OR branch.   */
            (void) FreeReExpr( ReExpr->Right );

            /*  Free to end of OR branch, and get pointer to next
            *   node.
            */
            EndOr = FreeReExpr( ReExpr->Left );
            Node = EndOr->Right;
            free( EndOr );
            break;
        case DATA_STRING:
            free( ReExpr->data.MatchStr );
            break;
        case DATA_SET:
            free( ReExpr->data.CSet );
            break;
        case OP_AND:
        case OP_L_PAREN:
        case OP_R_PAREN:
        case DATA_LEFT_ANCHOR:
        case DATA_RIGHT_ANCHOR:
        case DATA_ANY:
        case DATA_SPAN:
            break;
        default:
            fprintf(ErrFile,
                    "%s %d : Error - illegal regular expression node type.\n",
                    __FILE__,
                    __LINE__);
            exit( 1 );
        }

        /*  Move along. */
        free( ReExpr );
        ReExpr = Node;
    }

    /*  Return pointer to end of chain. */
    return( ReExpr );
}
