#!/usr/bin/perl # # UniLexer.pm - Tokenizer for C/C++/C#/IDL/Java/Pascal/Eiffel/Ada files # # Usage: my @tokens = UniLexer::lex("myfile.c"); # Scans the file myfile.c into @tokens. # # $Id: UniLexer.pm,v 1.7 2020/01/02 11:47:44 okellogg Exp $ # # Copyright (C) 2008-2018, O. Kellogg # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # $Log: UniLexer.pm,v $ # Revision 1.7 2020/01/02 11:47:44 okellogg # Fix scanning of Ada Tic in corner case expressions such as # V : String := Character'(''')'Image; # - In sub tokenize for-loop of $i case $ch "'" case $language ADA case # (substr($text, $i + 2 , 1) eq "'"), if $ondx is valid and the text at # $out[$ondx] is an identifier then set $out[++$ondx] to "'". # This means that if the predecessor token of a first quote is an # identifier then the quote is parsed as a tic. # # Revision 1.6 2018/06/06 06:03:47 okellogg # - In sub lex languages Ada and Eiffel, avoid misinterpreting sequence of # "--" embedded in a string as a real comment. # # Revision 1.5 2018/03/10 14:16:44 okellogg # - In sub tokenize, # - For C family languages, put C preprocessor macros in two tokens: # The first is the macro command (e.g. "#define") and the second is # the entire macro content (e.g. new defined symbol and substitution # expression). # - For language Ada string literals fix joining of embedded quotation # marks. # - In sub lex languages Ada and Eiffel, avoid misinterpreting "--" # embedded in a string as a real comment. # # Revision 1.4 2014/10/15 22:14:24 kellogg # - Rename sub join_ops to hljoin (for High Level Join), remove operator # joining code, and pass the input list by reference instead of returning # a second list. # - Perform joining of operators directly in sub tokenize. # Exploit whitespace in the input to avoid overzealous joining. # - In sub tokenize, # - for C_FAMILY improve single/multi line comment handling robustness; # - for PASCAL_DELPHI implement multi line comment handling for both (* *) # and { } comments. # # Revision 1.3 2012/08/12 22:26:57 kellogg # - Separate EIFFEL_ADA into EIFFEL and ADA due to syntactic differences. # - Extend sub tokenize to properly handle character and string constants. # # Revision 1.2 2012/08/12 14:45:51 kellogg # sub lex: On splitting string into $pre/$str/$post, add missing push # onto @linenum to account for $str # # Revision 1.1 2012/08/05 22:08:05 kellogg # - The array @lindex introduced in v0.9 was backwards. # Replace it by the array @linenum which is indexed by token index # and returns the line number (counting from 1.) # - Add externally settable flag $promote_comments (default: true.) # Setting it to false suppresses all comments in the result list of sub lex. # # Revision 1.0 2012/07/29 14:49:09 kellogg # sub join_ops: address warnings from running perl with "-w" # # Revision 0.9 2012/07/11 05:33:56 kellogg # New array @lindex maps line numbers to token indexes # # Revision 0.8 2008/07/06 19:11:49 kellogg # The comment token may contain a leading space which marks it as # following on the same line with the previous tokens. # # Revision 0.7 2008/06/15 21:40:58 kellogg # Add missing reset of $singleline_comment # # Revision 0.6 2008/06/15 16:23:57 kellogg # Fix loop index management in sub join_ops. # # # Ver.| Date | History #-----+--------+---------------------------------------------------------------- # 0.5 20080613 Join C preprocessor directives, e.g. "#include" "" # # 0.4 20080609 Move single-line comment processing from tokenize() to lex(). # Fix multi-line processing in sub lex. # Add C++ scope separator "::" to @operators. # # 0.3 20080608 Join based numbers of Ada, e.g. 2#01001011# (sub join_ops) # Properly join character constants (e.g. 'A', or '\n' in C) # package UniLexer; require Exporter; @ISA = ('Exporter'); @EXPORT = (); @EXPORT_OK = ('lex'); use vars qw( $preserve_comments @linenum ); sub C_FAMILY () { 0 } sub PASCAL_DELPHI () { 1 } sub EIFFEL () { 2 } sub ADA () { 3 } # If $promote_comments is set to 0 then comments will not be promoted to the # result list of lex() $promote_comments = 1; # This list is indexed in parallel with the result list from sub lex. # It returns the line number on which a given token (as identified by its # index in the result list from sub lex) is located. # For example, for the following Ada code: # package Pkg is -- line 1 # -- line 2 (empty) # end Pkg; -- line 3 # the following is stored in @linenum: # $linenum[0] = 1 token 0 = "package" # $linenum[1] = 1 token 1 = "Pkg" # $linenum[2] = 1 token 2 = "is" # $linenum[3] = 3 token 3 = "end" # $linenum[4] = 3 token 4 = "Pkg" # $linenum[5] = 3 token 5 = ";" @linenum = (); my $language = C_FAMILY; my @operators = ("<<=", ">>=", "<<", ">>", "++", "--", "+=", "-=", "*=", "/=", "^=", "~=", "|=", "&=", "==", "!=", "<=", ">=", "||", "&&", "->", "=>", ":=", "**", "::", "...", "..", "<>"); my $lineno = 0; # Joining of operators is done in tokenize(). # Here we perform higher level joining. sub hljoin($) { my $listref = shift; for (my $i = 1; $i < scalar(@$listref) - 1; $i++) { $listref->[$i] eq '.' or next; # join a floating point constant if ($listref->[$i - 1] =~ /^\d/ and $listref->[$i + 1] =~ /^\d/) { $listref->[$i - 1] .= '.' . $listref->[$i + 1]; splice @$listref, $i, 2; if ($listref->[$i - 1] =~ /e$/i) { if ($listref->[$i] eq '+' || $listref->[$i] eq '-') { $listref->[$i - 1] .= $listref->[$i]; splice @$listref, $i, 1; } $listref->[$i - 1] .= $listref->[$i]; splice @$listref, $i, 1; } $i--; # counteract loop increment } } if ($language == C_FAMILY && $listref->[0] eq '#') { # Join preprocessor directives if ($listref->[1] =~ /^(include|pragma|define|undef|ifdef|ifndef|if|else|endif)$/) { $listref->[0] .= $listref->[1]; splice(@$listref, 1, 1); # Join of a #include if ($listref->[0] eq '#include' && $listref->[1] eq '<') { while (scalar(@$listref) > 2) { my $tmp = $listref->[2]; $listref->[1] .= $tmp; splice(@$listref, 2, 1); last if ($tmp eq '>'); } } } } } sub tokenize { my $text = shift; my @out = (); if ($language == C_FAMILY and $text =~ /^#/) { $text =~ s{\s*/[/*].*$}{}; $text =~ s/^#\s+/#/; my $directive = $text; $directive =~ s/\s.*$//; @out = ($directive); unless ($text =~ /^#endif/) { $text =~ s/^#\w+\s+//; push @out, $text; # all arguments are stored in a single element } return @out; } my $ondx = -1; my $in_lit = 0; my $in_string = 0; my $seen_space = 0; my $i = 0; my $len = length($text); for (; $i < $len; $i++) { my $ch = substr($text, $i, 1); if ($in_string) { $out[$ondx] .= $ch; if ($ch eq '"') { if ($language == ADA) { if ($i < $len - 1) { my $next_ch = substr($text, $i + 1, 1); if ($next_ch eq '"') { $out[$ondx] .= '"'; $i++; } else { $in_string = 0; } } else { $in_string = 0; } } else { my $prev_ch = substr($text, $i - 1, 1); if ($prev_ch ne "\\") { $in_string = 0; } } } } elsif ($ch eq '"') { $in_lit = 0; $seen_space = 0; $in_string = 1; $out[++$ondx] = $ch; } elsif ($ch eq "'") { $in_lit = 0; $seen_space = 0; if ($language == ADA) { if (substr($text, $i + 2 , 1) eq "'") { if ($ondx >= 0 && $out[$ondx] =~ /^\w+/) { $out[++$ondx] = "'"; # attribute reference } else { $out[++$ondx] = substr($text, $i, 3); $i += 2; } } else { $out[++$ondx] = "'"; # attribute reference } } else { my $endx = index($text, "'", $i + 2); if ($endx < $i + 2) { print STDERR "line $lineno: no ending ' of char literal\n"; return @out; } elsif ($language == C_FAMILY && substr($text, $endx - 1, 1) eq "\\") { $endx++; # escaped apostrophe } $out[++$ondx] = substr($text, $i, $endx - $i + 1); $i = $endx; } } elsif ($ch =~ /^\w$/) { if (! $in_lit) { $in_lit = 1; $out[++$ondx] = $ch; } else { $out[$ondx] .= $ch; } $seen_space = 0; } elsif ($in_lit) { if ($language == ADA && $ch eq '#') { $out[$ondx] .= $ch; # based number } else { $in_lit = 0; if ($ch !~ /^\s$/) { $out[++$ondx] = $ch; } } } elsif ($ch eq ' ' || $ch eq "\t") { $in_lit = 0; $seen_space = 1; } else { $in_lit = 0; my $seen_operator = 0; if (!$seen_space && $ondx >= 0 and $out[$ondx] =~ /^\W/) { # Assemble known operator. # To be honest this should depend on the language but # 1) that would complicate things and # 2) we're not attempting to emulate a compiler's syntax checks. foreach my $op (@operators) { my $olen = length($op); my $ostart = substr($op, 0, $olen - 1); my $olast = substr($op, $olen - 1); if ($out[$ondx] eq $ostart && $ch eq $olast) { $seen_operator = 1; last; } } } if ($seen_operator) { $out[$ondx] .= $ch; } else { $out[++$ondx] = $ch; } $seen_space = 0; } } hljoin(\@out); # print "--- tokenize:\n"; # foreach (@out) { # print "\"$_\"\n"; # } # print "------------------\n"; return @out; } my $multiline_comment = ""; my $singleline_comment = ""; my @tokens = (); sub lex { my $inputfile = shift; unless (open(IN, "<$inputfile")) { warn "cannot open file $inputfile\n"; return (); } if ($inputfile =~ /\.pas$/) { $language = PASCAL_DELPHI; } elsif ($inputfile =~ /\.ad[abs]$/) { $language = ADA; } elsif ($inputfile =~ /\.e$/) { $language = EIFFEL; } else { $language = C_FAMILY; } @tokens = (); @linenum = (); $lineno = 0; $singleline_comment = ""; $multiline_comment = ""; while () { chop; s/^\s+//; # no whitespace at start of line s/\s+$//; # no whitespace at end of line $lineno++; next if ($_ eq ""); my $l = $_; if ($multiline_comment) { $multiline_comment .= "$l\n"; my $found_comment_end = 0; if ($language == C_FAMILY and $l =~ /\*\//) { $found_comment_end = 1; } elsif ($language == PASCAL_DELPHI && ( $l =~ /\*\)/ or $l =~ /\}/)) { $found_comment_end = 1; } if ($found_comment_end) { if ($promote_comments) { push @tokens, $multiline_comment; push @linenum, $lineno; } $multiline_comment = ""; } next; } elsif ($language == C_FAMILY) { my $slci = index($l, "//"); my $mlci = index($l, "/*"); if ($slci >= 0) { if ($mlci >= 0) { if ($slci < $mlci) { $singleline_comment = substr($l, $slci); $l =~ s/\s*\/\/.*$//; } elsif ($l =~ /(\/\*.*\*\/)/) { $singleline_comment = $1; $l =~ s/\s*\/\*.*\*\///; } else { $multiline_comment = substr($l, $mlci); $l =~ s/\s*\/\*.*$//; } } else { $singleline_comment = substr($l, $slci); $l =~ s/\s*\/\/.*$//; } } elsif ($mlci >= 0) { if ($l =~ /(\/\*.*\*\/)/) { $singleline_comment = $1; $l =~ s/\s*\/\*.*\*\///; } else { $multiline_comment = substr($l, $mlci) . "\n"; $l =~ s/\s*\/\*.*$//; } } } elsif ($language == PASCAL_DELPHI) { if ($l =~ /(\(\*.*\*\))/) { $singleline_comment = $1; $l =~ s/\s*\(\*.*\*\)//; } elsif ($l =~ /(\(\*.*)$/) { $multiline_comment = "$1\n"; $l =~ s/\s*\(\*.*$//; } my $slci = index($l, "//"); my $mlci = index($l, "(*"); my $modern_multi = 0; if ($mlci < 0) { $mlci = index($l, "{"); $modern_multi = 1; } if ($slci >= 0) { if ($mlci >= 0) { if ($slci < $mlci) { $singleline_comment = substr($l, $slci); $l =~ s/\s*\/\/.*$//; } elsif ($l =~ /(\(\*.*\*\))/) { $singleline_comment = $1; $l =~ s/\s*\(\*.*\*\)//; } elsif ($l =~ /(\{.*\})/) { $singleline_comment = $1; $l =~ s/\s*\{.*\}//; } else { $multiline_comment = substr($l, $mlci) . "\n"; if ($modern_multi) { $l =~ s/\s*\{.*$//; } else { $l =~ s/\s*\(\*.*$//; } } } else { $singleline_comment = substr($l, $slci); $l =~ s/\s*\/\/.*$//; } } elsif ($mlci >= 0) { if ($l =~ /(\(\*.*\*\))/) { $singleline_comment = $1; $l =~ s/\s*\(\*.*\*\)//; } else { $multiline_comment = substr($l, $mlci) . "\n"; if ($modern_multi) { $l =~ s/\s*\{.*$//; } else { $l =~ s/\s*\(\*.*$//; } } } } elsif ($language == ADA || $language == EIFFEL) { my $l2 = $l; # In order to avoid misinterpreting "--" embedded in a string # as a real comment, we count the number of quotation marks # preceding the "--". If the number is odd then the "--" is # known to be embedded in a string. # However, a quotation mark may also appear as a single # character literal, i.e. '"'. As this would confuse the # counting of quotation marks, we create a copy of the line, # and in the copy we blank out any '"'. The copy is only # used for counting the quotation marks. $l2 =~ s/'"'/.../g; # blank out '"' but preserve line length my $absi = 0; my $reli; while (($reli = index(substr($l2, $absi), "--")) >= 0) { if ($reli == 0) { $singleline_comment = substr($l, $absi); if ($absi == 0) { $l = ""; } else { $l = substr($l, 0, $absi); } last; } $absi += $reli; my $n_qmarks = substr($l2, 0, $absi) =~ tr/"/"/; if ($n_qmarks % 2 == 0) { $singleline_comment = substr($l, $absi); $l = substr($l, 0, $absi); last; } $reli = index(substr($l2, $absi + 2), '"'); $absi += $reli + 1; } } if ($l =~ /^\s*$/) { if ($singleline_comment ne "") { if ($promote_comments) { push @tokens, $singleline_comment; push @linenum, $lineno; } $singleline_comment = ""; } next; } my @tok = tokenize($l); push @tokens, @tok; foreach (@tok) { push @linenum, $lineno; } if ($singleline_comment ne "") { if ($promote_comments) { push @tokens, " $singleline_comment"; # The leading space before the comment marks it as # following on the same line with the previous tokens. } $singleline_comment = ""; } } close IN; return @tokens; } 1;