#!/usr/bin/perl
#
# UniLexer.pm - Tokenizer for C/C++/C#/IDL/Java/Pascal/Eiffel/Ada files
#
# Usage:        my @tokens = UniLexer::lex("myfile.c");
#               Scans the file myfile.c into @tokens.
#
# $Id: UniLexer.pm,v 1.7 2020/01/02 11:47:44 okellogg Exp $
#
# Copyright (C) 2008-2018, O. Kellogg <okellogg@users.sourceforge.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# $Log: UniLexer.pm,v $
# Revision 1.7  2020/01/02 11:47:44  okellogg
# Fix scanning of Ada Tic in corner case expressions such as
#   V : String := Character'(''')'Image;
# - In sub tokenize for-loop of $i case $ch "'" case $language ADA case
#   (substr($text, $i + 2 , 1) eq "'"), if $ondx is valid and the text at
#   $out[$ondx] is an identifier then set $out[++$ondx] to "'".
#   This means that if the predecessor token of a first quote is an
#   identifier then the quote is parsed as a tic.
#
# Revision 1.6  2018/06/06 06:03:47  okellogg
# - In sub lex languages Ada and Eiffel, avoid misinterpreting sequence of
#   "--" embedded in a string as a real comment.
#
# Revision 1.5  2018/03/10 14:16:44  okellogg
# - In sub tokenize,
#   - For C family languages, put C preprocessor macros in two tokens:
#     The first is the macro command (e.g. "#define") and the second is
#     the entire macro content (e.g. new defined symbol and substitution
#     expression).
#   - For language Ada string literals fix joining of embedded quotation
#     marks.
# - In sub lex languages Ada and Eiffel, avoid misinterpreting "--"
#   embedded in a string as a real comment.
#
# Revision 1.4  2014/10/15 22:14:24  kellogg
# - Rename sub join_ops to hljoin (for High Level Join), remove operator
#   joining code, and pass the input list by reference instead of returning
#   a second list.
# - Perform joining of operators directly in sub tokenize.
#   Exploit whitespace in the input to avoid overzealous joining.
# - In sub tokenize,
#   - for C_FAMILY improve single/multi line comment handling robustness;
#   - for PASCAL_DELPHI implement multi line comment handling for both (* *)
#     and { } comments.
#
# Revision 1.3  2012/08/12 22:26:57  kellogg
# - Separate EIFFEL_ADA into EIFFEL and ADA due to syntactic differences.
# - Extend sub tokenize to properly handle character and string constants.
#
# Revision 1.2  2012/08/12 14:45:51  kellogg
# sub lex: On splitting string into $pre/$str/$post, add missing push
# onto @linenum to account for $str
#
# Revision 1.1  2012/08/05 22:08:05  kellogg
# - The array @lindex introduced in v0.9 was backwards.
#   Replace it by the array @linenum which is indexed by token index
#   and returns the line number (counting from 1.)
# - Add externally settable flag $promote_comments (default: true.)
#   Setting it to false suppresses all comments in the result list of sub lex.
#
# Revision 1.0  2012/07/29 14:49:09  kellogg
# sub join_ops: address warnings from running perl with "-w"
#
# Revision 0.9  2012/07/11 05:33:56  kellogg
# New array @lindex maps line numbers to token indexes
#
# Revision 0.8  2008/07/06 19:11:49  kellogg
# The comment token may contain a leading space which marks it as
# following on the same line with the previous tokens.
#
# Revision 0.7  2008/06/15 21:40:58  kellogg
# Add missing reset of $singleline_comment
#
# Revision 0.6  2008/06/15 16:23:57  kellogg
# Fix loop index management in sub join_ops.
#
#
# Ver.|  Date  | History
#-----+--------+----------------------------------------------------------------
# 0.5  20080613 Join C preprocessor directives, e.g. "#include" "<iostream>"
#
# 0.4  20080609 Move single-line comment processing from tokenize() to lex().
#               Fix multi-line processing in sub lex.
#               Add C++ scope separator "::" to @operators.
#
# 0.3  20080608 Join based numbers of Ada, e.g. 2#01001011# (sub join_ops)
#               Properly join character constants (e.g. 'A', or '\n' in C)
#

package UniLexer;
require Exporter;
@ISA = ('Exporter');
@EXPORT = ();
@EXPORT_OK = ('lex');

use vars qw( $preserve_comments @linenum );

sub C_FAMILY      () { 0 }
sub PASCAL_DELPHI () { 1 }
sub EIFFEL        () { 2 }
sub ADA           () { 3 }

# If $promote_comments is set to 0 then comments will not be promoted to the
# result list of lex()
$promote_comments = 1;

# This list is indexed in parallel with the result list from sub lex.
# It returns the line number on which a given token (as identified by its
# index in the result list from sub lex) is located.
# For example, for the following Ada code:
#    package Pkg is      -- line 1
#                        -- line 2 (empty)
#    end Pkg;            -- line 3
# the following is stored in @linenum:
# $linenum[0] = 1        token 0 = "package"
# $linenum[1] = 1        token 1 = "Pkg"
# $linenum[2] = 1        token 2 = "is"
# $linenum[3] = 3        token 3 = "end"
# $linenum[4] = 3        token 4 = "Pkg"
# $linenum[5] = 3        token 5 = ";"
@linenum = ();

my $language = C_FAMILY;
my @operators = ("<<=", ">>=", "<<", ">>", "++", "--",
                 "+=", "-=", "*=", "/=", "^=", "~=",
                 "|=", "&=", "==", "!=", "<=", ">=",
                 "||", "&&", "->", "=>", ":=", "**",
                 "::", "...", "..", "<>");
my $lineno = 0;

# Joining of operators is done in tokenize().
# Here we perform higher level joining.
sub hljoin($) {
    my $listref = shift;
    for (my $i = 1; $i < scalar(@$listref) - 1; $i++) {
        $listref->[$i] eq '.' or next;
        # join a floating point constant
        if ($listref->[$i - 1] =~ /^\d/ and
            $listref->[$i + 1] =~ /^\d/) {
            $listref->[$i - 1] .= '.' . $listref->[$i + 1];
            splice @$listref, $i, 2;
            if ($listref->[$i - 1] =~ /e$/i) {
                if ($listref->[$i] eq '+' || $listref->[$i] eq '-') {
                    $listref->[$i - 1] .= $listref->[$i];
                    splice @$listref, $i, 1;
                }
                $listref->[$i - 1] .= $listref->[$i];
                splice @$listref, $i, 1;
            }
            $i--;  # counteract loop increment
        }
    }
    if ($language == C_FAMILY && $listref->[0] eq '#') {
        # Join preprocessor directives
        if ($listref->[1] =~
            /^(include|pragma|define|undef|ifdef|ifndef|if|else|endif)$/) {
            $listref->[0] .= $listref->[1];
            splice(@$listref, 1, 1);
            # Join <file> of a #include
            if ($listref->[0] eq '#include' && $listref->[1] eq '<') {
                while (scalar(@$listref) > 2) {
                    my $tmp = $listref->[2];
                    $listref->[1] .= $tmp;
                    splice(@$listref, 2, 1);
                    last if ($tmp eq '>');
                }
            }
        }
    }
}

sub tokenize {
    my $text = shift;
    my @out = ();
    if ($language == C_FAMILY and $text =~ /^#/) {
        $text =~ s{\s*/[/*].*$}{};
        $text =~ s/^#\s+/#/;
        my $directive = $text;
        $directive =~ s/\s.*$//;
        @out = ($directive);
        unless ($text =~ /^#endif/) {
            $text =~ s/^#\w+\s+//;
            push @out, $text;  # all arguments are stored in a single element
        }
        return @out;
    }
    my $ondx = -1;
    my $in_lit = 0;
    my $in_string = 0;
    my $seen_space = 0;
    my $i = 0;
    my $len = length($text);
    for (; $i < $len; $i++) {
        my $ch = substr($text, $i, 1);
        if ($in_string) {
            $out[$ondx] .= $ch;
            if ($ch eq '"') {
                if ($language == ADA) {
                    if ($i < $len - 1) {
                        my $next_ch = substr($text, $i + 1, 1);
                        if ($next_ch eq '"') {
                            $out[$ondx] .= '"';
                            $i++;
                        } else {
                            $in_string = 0;
                        }
                    } else {
                        $in_string = 0;
                    }
                } else {
                    my $prev_ch = substr($text, $i - 1, 1);
                    if ($prev_ch ne "\\") {
                        $in_string = 0;
                    }
                }
            }
        } elsif ($ch eq '"') {
            $in_lit = 0;
            $seen_space = 0;
            $in_string = 1;
            $out[++$ondx] = $ch;
        } elsif ($ch eq "'") {
            $in_lit = 0;
            $seen_space = 0;
            if ($language == ADA) {
                if (substr($text, $i + 2 , 1) eq "'") {
                    if ($ondx >= 0 && $out[$ondx] =~ /^\w+/) {
                        $out[++$ondx] = "'";  # attribute reference
                    } else {
                        $out[++$ondx] = substr($text, $i, 3);
                        $i += 2;
                    }
                } else {
                    $out[++$ondx] = "'";  # attribute reference
                }
            } else {
                my $endx = index($text, "'", $i + 2);
                if ($endx < $i + 2) {
                    print STDERR "line $lineno: no ending ' of char literal\n";
                    return @out;
                } elsif ($language == C_FAMILY &&
                         substr($text, $endx - 1, 1) eq "\\") {
                    $endx++;  # escaped apostrophe
                }
                $out[++$ondx] = substr($text, $i, $endx - $i + 1);
                $i = $endx;
            }
        } elsif ($ch =~ /^\w$/) {
            if (! $in_lit) {
                $in_lit = 1;
                $out[++$ondx] = $ch;
            } else {
                $out[$ondx] .= $ch;
            }
            $seen_space = 0;
        } elsif ($in_lit) {
            if ($language == ADA && $ch eq '#') {
                $out[$ondx] .= $ch;  # based number
            } else {
                $in_lit = 0;
                if ($ch !~ /^\s$/) {
                    $out[++$ondx] = $ch;
                }
            }
        } elsif ($ch eq ' ' || $ch eq "\t") {
            $in_lit = 0;
            $seen_space = 1;
        } else {
            $in_lit = 0;
            my $seen_operator = 0;
            if (!$seen_space && $ondx >= 0 and $out[$ondx] =~ /^\W/) {
                # Assemble known operator.
                # To be honest this should depend on the language but
                # 1) that would complicate things and
                # 2) we're not attempting to emulate a compiler's syntax checks.
                foreach my $op (@operators) {
                    my $olen = length($op);
                    my $ostart = substr($op, 0, $olen - 1);
                    my $olast  = substr($op, $olen - 1);
                    if ($out[$ondx] eq $ostart && $ch eq $olast) {
                        $seen_operator = 1;
                        last;
                    }
                }
            }
            if ($seen_operator) {
                $out[$ondx] .= $ch;
            } else {
                $out[++$ondx] = $ch;
            }
            $seen_space = 0;
        }
    }
    hljoin(\@out);
    # print "--- tokenize:\n";
    # foreach (@out) {
    #     print "\"$_\"\n";
    # }
    # print "------------------\n";
    return @out;
}

my $multiline_comment = "";
my $singleline_comment = "";
my @tokens = ();

sub lex {
    my $inputfile = shift;
    unless (open(IN, "<$inputfile")) {
        warn "cannot open file $inputfile\n";
        return ();
    }
    if ($inputfile =~ /\.pas$/) {
        $language = PASCAL_DELPHI;
    } elsif ($inputfile =~ /\.ad[abs]$/) {
        $language = ADA;
    } elsif ($inputfile =~ /\.e$/) {
        $language = EIFFEL;
    } else {
        $language = C_FAMILY;
    }
    @tokens = ();
    @linenum = ();
    $lineno = 0;
    $singleline_comment = "";
    $multiline_comment = "";

    while (<IN>) {
        chop;
        s/^\s+//;     # no whitespace at start of line
        s/\s+$//;     # no whitespace at end of line
        $lineno++;
        next if ($_ eq "");
        my $l = $_;
        if ($multiline_comment) {
            $multiline_comment .= "$l\n";
            my $found_comment_end = 0;
            if ($language == C_FAMILY and $l =~ /\*\//) {
                $found_comment_end = 1;
            } elsif ($language == PASCAL_DELPHI &&
                     ( $l =~ /\*\)/ or $l =~ /\}/)) {
                $found_comment_end = 1;
            }
            if ($found_comment_end) {
                if ($promote_comments) {
                    push @tokens, $multiline_comment;
                    push @linenum, $lineno;
                }
                $multiline_comment = "";
            }
            next;
        } elsif ($language == C_FAMILY) {
            my $slci = index($l, "//");
            my $mlci = index($l, "/*");
            if ($slci >= 0) {
                if ($mlci >= 0) {
                    if ($slci < $mlci) {
                        $singleline_comment = substr($l, $slci);
                        $l =~ s/\s*\/\/.*$//;
                    } elsif ($l =~ /(\/\*.*\*\/)/) {
                        $singleline_comment = $1;
                        $l =~ s/\s*\/\*.*\*\///;
                    } else {
                        $multiline_comment = substr($l, $mlci);
                        $l =~ s/\s*\/\*.*$//;
                    }
                } else {
                    $singleline_comment = substr($l, $slci);
                    $l =~ s/\s*\/\/.*$//;
                }
            } elsif ($mlci >= 0) {
                if ($l =~ /(\/\*.*\*\/)/) {
                    $singleline_comment = $1;
                    $l =~ s/\s*\/\*.*\*\///;
                } else {
                    $multiline_comment = substr($l, $mlci) . "\n";
                    $l =~ s/\s*\/\*.*$//;
                }
            }
        } elsif ($language == PASCAL_DELPHI) {
            if ($l =~ /(\(\*.*\*\))/) {
                $singleline_comment = $1;
                $l =~ s/\s*\(\*.*\*\)//;
            } elsif ($l =~ /(\(\*.*)$/) {
                $multiline_comment = "$1\n";
                $l =~ s/\s*\(\*.*$//;
            }
            my $slci = index($l, "//");
            my $mlci = index($l, "(*");
            my $modern_multi = 0;
            if ($mlci < 0) {
                $mlci = index($l, "{");
                $modern_multi = 1;
            }
            if ($slci >= 0) {
                if ($mlci >= 0) {
                    if ($slci < $mlci) {
                        $singleline_comment = substr($l, $slci);
                        $l =~ s/\s*\/\/.*$//;
                    } elsif ($l =~ /(\(\*.*\*\))/) {
                        $singleline_comment = $1;
                        $l =~ s/\s*\(\*.*\*\)//;
                    } elsif ($l =~ /(\{.*\})/) {
                        $singleline_comment = $1;
                        $l =~ s/\s*\{.*\}//;
                    } else {
                        $multiline_comment = substr($l, $mlci) . "\n";
                        if ($modern_multi) {
                            $l =~ s/\s*\{.*$//;
                        } else {
                            $l =~ s/\s*\(\*.*$//;
                        }
                    }
                } else {
                    $singleline_comment = substr($l, $slci);
                    $l =~ s/\s*\/\/.*$//;
                }
            } elsif ($mlci >= 0) {
                if ($l =~ /(\(\*.*\*\))/) {
                    $singleline_comment = $1;
                    $l =~ s/\s*\(\*.*\*\)//;
                } else {
                    $multiline_comment = substr($l, $mlci) . "\n";
                    if ($modern_multi) {
                        $l =~ s/\s*\{.*$//;
                    } else {
                        $l =~ s/\s*\(\*.*$//;
                    }
                }
            }
        } elsif ($language == ADA || $language == EIFFEL) {
            my $l2 = $l;
            # In order to avoid misinterpreting "--" embedded in a string
            # as a real comment, we count the number of quotation marks
            # preceding the "--". If the number is odd then the "--" is
            # known to be embedded in a string.
            # However, a quotation mark may also appear as a single
            # character literal, i.e. '"'.  As this would confuse the
            # counting of quotation marks, we create a copy of the line,
            # and in the copy we blank out any '"'.  The copy is only
            # used for counting the quotation marks.
            $l2 =~ s/'"'/.../g;  # blank out '"' but preserve line length
            my $absi = 0;
            my $reli;
            while (($reli = index(substr($l2, $absi), "--")) >= 0) {
                if ($reli == 0) {
                    $singleline_comment = substr($l, $absi);
                    if ($absi == 0) {
                        $l = "";
                    } else {
                        $l = substr($l, 0, $absi);
                    }
                    last;
                }
                $absi += $reli;
                my $n_qmarks = substr($l2, 0, $absi) =~ tr/"/"/;
                if ($n_qmarks % 2 == 0) {
                    $singleline_comment = substr($l, $absi);
                    $l = substr($l, 0, $absi);
                    last;
                }
                $reli = index(substr($l2, $absi + 2), '"');
                $absi += $reli + 1;
            }
        }
        if ($l =~ /^\s*$/) {
            if ($singleline_comment ne "") {
                if ($promote_comments) {
                    push @tokens, $singleline_comment;
                    push @linenum, $lineno;
                }
                $singleline_comment = "";
            }
            next;
        }
        my @tok = tokenize($l);
        push @tokens, @tok;
        foreach (@tok) {
            push @linenum, $lineno;
        }
        if ($singleline_comment ne "") {
            if ($promote_comments) {
                push @tokens, " $singleline_comment";
                # The leading space before the comment marks it as
                # following on the same line with the previous tokens.
            }
            $singleline_comment = "";
        }
    }
    close IN;
    return @tokens;
}

1;