#! /usr/bin/env perl use strict; use warnings; use Term::ANSIColor; use Encode; use POSIX qw(locale_h); use locale; setlocale(LC_CTYPE,"no_NO"); my $versionnumber="2.2"; my $versiondate="2009 Apr 30"; ###### Set CMD specific settings and variables # Options and states my $verbose=0; my $showcodes=1; my $showstates=0; my $showsubcounts=0; my $htmlstyle=0; my $includeTeX=0; my $briefsum=0; my @sumweights; my $utf8flag=0; # Global variables my $blankline=0; my $errorcount=0; # CMD specific global variables my $totalflag=0; my @filelist; my $workdir; my $globalworkdir=""; ###### Set global settings and variables ### Macros for headers # Macros that identify headers: i.e. following token or # {...} is counted as header. The =>[2] indicates transition to # state 2 which is used within headers (although the value is # actually never used). This is copied to %TeXmacro and the # only role of defining it here is that the counter for the number # of headers is incremented by one. my %TeXheader=('\title'=>[2],'\part'=>[2],'\chapter'=>[2], '\section'=>[2],'\subsection'=>[2],'\subsubsection'=>[2], '\paragraph'=>[2],'\subparagraph'=>[2]); ### How many tokens to gobble after macro # Each macro is assumed to gobble up a given number of # tokens (or {...} groups), as well as options [...] before, within # and after. The %TeXmacro hash gives a link from a macro # (or beginNAME for begin-end groups without the backslash) # to either an integer giving the number of tokens to ignore # or to an array (specified as [num,num,...]) of length N where # N is the number of tokens to be read with the macro and the # array values tell how each is to be interpreted (see the status # values: 0=ignore, 1=count, etc.). Thus specifying a number N is # equivalent to specifying an array [0,...,0] of N zeros. # # For macros not specified here, the default value is 0: i.e. # no tokens are excluded, but [...] options are. Header macros # specified in %TeXheader are automatically included here. my %TeXmacro=(%TeXheader, '\documentclass'=>1,'\documentstyle'=>1,'\usepackage'=>1, '\hyphenation'=>1, '\pagestyle'=>1,'\thispagestyle'=>1, '\pagenumbering'=>1,'\markboth'=>1, '\markright'=>1, '\newcommand'=>[-3,-3],'\renewcommand'=>[-3,-3], '\newenvironment'=>[-3,-3,-3], 'renewenvironment'=>[-3,-3,-3], '\newfont'=>2,'\newtheorem'=>2,'\bibliographystyle'=>1, '\bibliography'=>1, '\parbox'=>1, '\marginpar'=>[3],'\makebox'=>0, '\raisebox'=>1, '\framebox'=>0, '\newsavebox'=>1, '\sbox'=>1, '\savebox'=>2, '\usebox'=>1,'\rule'=>2, '\footnote'=>[3],'\label'=>1, '\ref'=>1, '\pageref'=>1, '\bibitem'=>1, '\cite'=>1, '\citep'=>1, '\citet'=>1, '\citeauthor'=>1, '\citealt'=>1, '\nocite'=>1, '\eqlabel'=>1, '\eqref'=>1,'\hspace'=>1, '\vspace'=>1, '\addvspace'=>1, '\input'=>1, '\include'=>1, '\includeonly'=>1,'\includegraphics'=>1, '\newlength'=>1, '\setlength'=>2, '\addtolength'=>2,'\settodepth'=>2, '\settoheight'=>2, '\settowidth'=>2,'\newcounter'=>1, '\setcounter'=>2, '\addtocounter'=>2,'\stepcounter'=>1, '\refstepcounter'=>1, '\usecounter'=>1, '\alph'=>1, '\arabic'=>1, '\fnsymbol'=>1, '\roman'=>1, '\value'=>1, '\cline'=>1, '\multicolumn'=>3,'\typeout'=>1, '\typein'=>1, 'beginlist'=>2, 'beginminipage'=>1, 'begintabular'=>1, 'beginthebibliography'=>1,'beginlrbox'=>1, '\begin'=>1,'\end'=>1,'\title'=>[2]); ### Macros that should be counted as one or more words # Macros that represent text may be declared here. The value gives # the number of words the macro represents. my %TeXmacroword=('\LaTeX'=>1,'\TeX'=>1); ### Macros that are counted within the preamble # The preamble is the text between \documentclass and \begin{document}. # Text and macros in the preamble is ignored unless specified here. The # value is the status (1=text, 2=header, etc.) they should be interpreted as. # Note that only the first unit (token or {...} block) is counted. my %TeXpreamble=('\title'=>[2], '\newcommand'=>[-3,-3],'\renewcommand'=>[-3,-3], '\newenvironment'=>[-3,-3,-3], 'renewenvironment'=>[-3,-3,-3], ); ### Begin-End groups # Identified as begin-end groups, and define =>state. The # states used corresponds to the elements of the count array, and # are: # 0: Not included # 1: Text, word included in text count # 2: Header, words included in header count # 3: Float caption, words included in float caption count # 6: Inline mathematics, words not counted # 7: Displayed mathematics, words not counted # -1: Float, not included, but looks for captions # # 4 and 5 are used to count number of headers and floats # and are not used as states. # # Groups that are not defined will be counted as the surrounding text. # # Note that some environments may only exist within math-mode, and # therefore need not be defined here: in fact, they should not as it # is not clear if they will be in inlined or displayed math. # my %TeXgroup=('document'=>1,'letter'=>1,'titlepage'=>0, 'center'=>1,'flushleft'=>1,'flushright'=>1, 'abstract'=>1,'quote'=>1,'quotation'=>1,'verse'=>1,'minipage'=>1,'verbatim'=>1, 'description'=>1,'enumerate'=>1,'itemize'=>1,'list'=>1, 'theorem'=>1,'lemma'=>1,'definition'=>1,'corollary'=>1,'example'=>1, 'math'=>6,'displaymath'=>7,'equation'=>7,'eqnarray'=>7,'align'=>7, 'figure'=>-1,'float'=>-1,'picture'=>-1,'table'=>-1, 'tabbing'=>0,'tabular'=>0,'thebibliography'=>0,'lrbox'=>0); ### In floats: include only specific macros # Macros used to identify caption text within floats. my %TeXfloatinc=('\caption'=>[3]); ### Macros for including tex files # Allows \macro{file} or \macro file. If the value is 0, the filename will # be used as is; if it is 1, the filetype .tex will be added if the # filename is without filetype; if it is 2, the filetype .tex will be added. my %TeXfileinclude=('\input'=>1,'\include'=>2); ### Count labels # Labels used to describe the counts my @countlabel=('Files','Words in text','Words in headers', 'Words in float captions','Number of headers','Number of floats', 'Number of math inlines','Number of math displayed'); ### Break points # Definition of macros that define break points that start a new subcount. # The values given are used as labels. my %BreakPointsOptions; $BreakPointsOptions{'none'}={}; $BreakPointsOptions{'part'}={%{$BreakPointsOptions{'none'}},'\part'=>'Part'}; $BreakPointsOptions{'chapter'}={%{$BreakPointsOptions{'part'}},'\chapter'=>'Chapter'}; $BreakPointsOptions{'section'}={%{$BreakPointsOptions{'chapter'}},'\section'=>'Section'}; $BreakPointsOptions{'subsection'}={%{$BreakPointsOptions{'section'}},'\subsection'=>'Subsection'}; $BreakPointsOptions{'default'}=$BreakPointsOptions{'subsection'}; my %BreakPoints=%{$BreakPointsOptions{'none'}}; ### Print styles # Definition of different print styles: maps of class labels # to ANSI codes. Class labels are as used by HTML styles. my @STYLES=(); my %STYLE; $STYLES[0]={'error'=>'bold red'}; $STYLES[1]={%{$STYLES[0]}, 'word1'=>'blue','word2'=>'bold blue','word3'=>'blue', 'grouping'=>'red','document'=>'red','mathgroup'=>'magenta', 'state'=>'cyan underline','sumcount'=>'yellow'}; $STYLES[2]={%{$STYLES[1]}, 'command'=>'green','exclcommand'=>'yellow','exclgroup'=>'yellow','exclmath'=>'yellow', 'ignore'=>'cyan'}; $STYLES[3]={%{$STYLES[2]}, 'tc'=>'bold yellow','comment'=>'yellow','option'=>'yellow', 'fileinclude'=>'bold green'}; $STYLES[4]={%{$STYLES[3]}}; ### Word regexp pattern list # List of regexp patterns that should be analysed as words. # Use @ to represent a letter, will be substituted with $LetterPattern. my @WordPatterns=('(@+\.)+@+\.?','@+([\-\']@+)*'); my $specialchars='\\\\(ae|AE|o|O|aa|AA)'; my $modifiedchars='\\\\[\'\"\`\~\^\=](\w|\{\w\})'; my $LetterPattern='\w'; my $LetterPatternRelaxed='([\w\-\']|'.$modifiedchars.'|'.$specialchars.'(\{\})?|\{'.$specialchars.'\}|\{\w\})'; my %NamedWordPattern; $NamedWordPattern{'chinese'}='\p{script=Han}'; $NamedWordPattern{'japanese'}='(\p{script=Han}|\p{script=Hiragana}|\p{script=Katakana})'; ### Macro option regexp list # List of regexp patterns to be gobbled as macro option in and after # a macro. my @MacroOptionPatterns=('\[(\w|[,\-\s\~\.\:\;\+\?\*\_\=])*\]'); my @MacroOptionPatternsRelaxed=('\[[^\[\]\n]*\]'); ###### Main script ################################################### MAIN(@ARGV); ################################################### ######### ######### Main routines ######### # MAIN ROUTINE: Handle arguments, then parse files sub MAIN { my @args=@_; my @toplevelfiles=Parse_Arguments(@args); Apply_Options(); if (scalar(@toplevelfiles)==0) { conditional_print_help_style() || print_error("No files specified.","p","error"); } else { conditional_print_help_style(); my $totalcount=parse_file_list(@toplevelfiles); conditional_print_total($totalcount); } Report_ErrorCount(); Close_Output(); } # Checks arguments, exits on exit condition sub Check_Arguments { my @args=@_; if (!@args) { print_version(); print_syntax(); print_reference(); exit; } elsif ($args[0]=~/^(\-?\-(h|\?|help)|\/(\?|h))$/) { print_help(); exit; } elsif ($args[0]=~/^\-?\-(ver|version)$/) { print_version(); exit; } elsif ($args[0]=~/^\-?\-(lic|license)$/) { print_license(); exit; } return 1; } # Parses arguments, sets options (global) and returns file list sub Parse_Arguments { my @args=@_; Check_Arguments(@args); my @files; foreach my $arg (@ARGV) { if (Parse_Option($arg)) {next;} if ($arg=~/^\-/) { print 'Invalid opton '.$arg."\n"; print_syntax(); exit; } $arg=~s/\\/\//g; push @files,$arg; } return @files; } # Parse individual option parameters sub Parse_Option { my $arg=shift @_; return parse_options_parsing($arg) || parse_options_sums($arg) || parse_options_output($arg) || parse_options_format($arg) ; } sub parse_options_parsing { my $arg=shift @_; if ($arg eq '-inc') {$includeTeX=1;} elsif ($arg eq '-noinc') {$includeTeX=0;} elsif ($arg eq '-dir') {$globalworkdir=undef;} elsif ($arg=~/^-dir=(.*)$/) {$globalworkdir=$1;} elsif ($arg=~/^-(utf8|unicode)$/) {$utf8flag=1;} elsif ($arg=~/^-(ch|chinese|zhongwen)$/) { $utf8flag=1; @WordPatterns=($NamedWordPattern{'chinese'},@WordPatterns); } elsif ($arg=~/^-(jp|japanese)$/) { $utf8flag=1; @WordPatterns=($NamedWordPattern{'japanese'},@WordPatterns); } elsif ($arg eq '-relaxed') { @MacroOptionPatterns=@MacroOptionPatternsRelaxed; $LetterPattern=$LetterPatternRelaxed; } else {return 0;} return 1; } sub parse_options_sums { my $arg=shift @_; if ($arg=~/^-sum(=(.+))?$/) {option_sum($2);} elsif ($arg=~/^-(sub|subcounts?)(=(.+))?$/) {option_subcount($3);} else {return 0;} return 1; } sub option_subcount { my $arg=shift @_; $showsubcounts=1; if (!defined $arg) { %BreakPoints=%{$BreakPointsOptions{'default'}}; } elsif (my $option=$BreakPointsOptions{$arg}) { %BreakPoints=%{$option}; } else { print STDERR "Warning: Option value ".$arg." not valid, using default instead.\n"; %BreakPoints=%{$BreakPointsOptions{'default'}}; } } sub option_sum { my $arg=shift @_; if (!defined $arg) { @sumweights=(1,1,1,0,0,1,1); } elsif ($arg=~/^(\d+(,\d+){0,6})$/) { @sumweights=split(',',$1); } else { print STDERR "Warning: Option value ".$arg." not valid, ignoring option.\n"; } } sub parse_options_format { my $arg=shift @_; if ($arg eq '-brief') {$briefsum=1;} elsif ($arg eq '-total') {$totalflag=1;} elsif ($arg eq '-1') {$briefsum=1;$totalflag=1;$verbose=-1;} elsif ($arg eq "-html" ) {option_no_colours();$htmlstyle = 2;} elsif ($arg eq "-htmlcore" ) {option_no_colours();$htmlstyle = 1;} elsif ($arg=~/^\-(nocol|nc$)/) {option_no_colours();} elsif ($arg eq '-codes') { $showcodes=2; if ($verbose==0) {$verbose=3;} } elsif ($arg eq '-nocodes') {$showcodes=0;} else {return 0;} return 1; } sub parse_options_output { my $arg=shift @_; if ($arg eq "-v0") {$verbose=0;} elsif ($arg eq "-v1") {$verbose=1;} elsif ($arg eq '-vv' || $arg eq '-v2') {$verbose=2;} elsif ($arg eq '-vvv' || $arg eq '-v3' || $arg eq '-v') {$verbose=3;} elsif ($arg eq '-vvvv' || $arg eq '-v4') {$verbose=3; $showstates=1;} elsif ($arg =~ /^\-showstates?$/ ) {$showstates=1;} elsif ($arg =~ /^-(q|-?quiet)$/ ) {$verbose=-1;} else {return 0;} return 1; } # Parse file list and return total count sub parse_file_list { my @filelist=@_; my $listtotalcount=new_count("TOTAL COUNT"); for my $file (<@filelist>) { my $filetotalcount=parse_file($file); add_count($listtotalcount,$filetotalcount); } return $listtotalcount; } # Parse file and included files, and return total count sub parse_file { my $file=shift @_; $workdir=$globalworkdir; if (!defined $workdir) { $workdir=$file; $workdir =~ s/^((.*[\\\/])?)[^\\\/]+$/$1/; } @filelist=($file); if ($htmlstyle) {print "\n
\n";} my $filetotalcount=new_count("SUM COUNT FOR ".$file); foreach my $f (@filelist) { my $tex=TeXfile($f); my $fpath=$f; $fpath=~s/^((.*[\\\/])?)[^\\\/]+$/$1/; if (!defined $tex) { print STDERR "File not found or not readable: ".$f."\n"; formatprint("File not found or not readable: ".$f."\n","p","error"); } else { parse($tex); my $filecount=add_subcount($tex); if (!$totalflag) { print_count($filecount); print "\n"; } add_count($filetotalcount,$filecount); } } if ($htmlstyle) {print "
\n\n";} return $filetotalcount; } ###### ###### Subroutines ###### ###### CMD specific implementations sub add_file_to_list { my $fname=shift @_; push @filelist,$workdir.$fname; } sub print_with_style { my ($text,$style,$colour)=@_; #if ($utf8flag || $htmlstyle) {utf8::encode($text);} if ($htmlstyle) { print "".$text.""; } else { print Term::ANSIColor::colored($text,$colour); } } sub option_no_colours { $ENV{'ANSI_COLORS_DISABLED'} = 1; } # Print count (total) if conditions are met sub conditional_print_total { my $sumcount=shift @_; if ($totalflag || get_count($sumcount,0)>1) { if ($totalflag && $briefsum && @sumweights) { print total_count($sumcount); } else { if ($htmlstyle) { formatprint("Total word count",'h2'); } print_count($sumcount); } } } ###### Option handling # Apply options to set values sub Apply_Options { %STYLE=%{$STYLES[$verbose]}; if ($utf8flag) {binmode STDOUT,':utf8';} if ($htmlstyle>1) {html_head();} foreach (@WordPatterns) { s/\@/$LetterPattern/g; } } ###### TeX code handle sub TeXfile { my $filename=shift @_; my $file=read_file($filename) || return undef; return TeXcode($file,$filename); } sub read_file { my $filename=shift @_; if ($utf8flag) { open(FH,"<:utf8",$filename) || return undef; } else { open(FH,"<".$filename) || return undef; } if ($verbose>0) { formatprint("File: ".$filename."\n",'h2'); $blankline=0; } my @text=; close(FH); my $latexcode=join('',@text); if ($utf8flag) { $latexcode =~ s/^\x{feff}//; } return $latexcode; } ###### Parsing routines # Make TeXcode handle sub TeXcode { my ($texcode,$filename,$title)=@_; my %TeX=(); $TeX{'filename'}=$filename; if (!defined $filename) { $TeX{'filepath'}=''; } elsif ($filename=~/^(.*[\\\/])[^\\\/]+$/) { $TeX{'filepath'}=$1; } else { $TeX{'filepath'}=''; } if (defined $title) {} elsif (defined $filename) {$title="FILE: ".$filename;} else {$title="Word count";} $TeX{'line'}=$texcode; $TeX{'next'}=undef; $TeX{'type'}=undef; $TeX{'style'}=undef; $TeX{'printstate'}=undef; $TeX{'eof'}=0; my $countsum=new_count($title); $TeX{'countsum'}=$countsum; my $count=new_count("_top_"); $TeX{'count'}=$count; inc_count(\%TeX,0); my @countlist=(); $TeX{'countlist'}=\@countlist; $countsum->{'subcounts'}=\@countlist; return \%TeX; } # Parse LaTeX document sub parse { my ($tex)=@_; if ($htmlstyle && $verbose) {print "

\n";} while (!($tex->{'eof'})) { parse_unit($tex,1); } if ($htmlstyle && $verbose) {print "

\n";} } # Parse one block or unit sub parse_unit { # Status: # 0 = exclude from count # 1 = text # 2 = header text # 3 = float text # -1 = float (exclude) # -2 = strong exclude, ignore begin-end groups # -3 = stronger exclude, do not parse macro parameters # -9 = preamble (between \documentclass and \begin{document}) my ($tex,$status,$end)=@_; if (!defined $status) { print_error("CRITICAL ERROR: Undefined parser status!"); exit; } elsif (ref($status) eq 'ARRAY') { print_error("CRITICAL ERROR: Invalid parser status!"); exit; } my $substat; if ($showstates) { if (defined $end) { $tex->{'printstate'}=':'.$status.':'.$end.':'; } else { $tex->{'printstate'}=':'.$status.':'; } flush_next($tex); } while (defined (my $next=next_token($tex))) { # parse next token; or tokens until match with $end set_style($tex,"ignore"); if ((defined $end) && ($end eq $next)) { # end of unit return; } elsif (!defined $next) { print_error("ERROR: End of file while waiting for ".$end); return; } if ($status==-9 && $next eq '\begin' && $tex->{'line'}=~/^\{\s*document\s*\}/) { # \begin{document} $status=1; } if ($next eq '\documentclass') { # starts preamble set_style($tex,'document'); gobble_option($tex); gobble_macro_parms($tex,1); while (!($tex->{'eof'})) { parse_unit($tex,-9); } } elsif ($tex->{'type'}==666) { # parse TC instructions parse_tc($tex); } elsif ($tex->{'type'}==1) { # word if ($status>0) { inc_count($tex,$status); set_style($tex,'word'.$status); } } elsif ($next eq '{') { # {...} parse_unit($tex,$status,'}'); } elsif ($tex->{'type'}==3 && $status==-3) { set_style($tex,'ignore'); } elsif ($tex->{'type'}==3) { # macro call parse_macro($tex,$next,$status,$substat); } elsif ($next eq '$') { # math inline parse_math($tex,$status,6,'$'); } elsif ($next eq '$$') { # math display (unless already in inlined math) if (!(defined $end && $end eq '$')) { parse_math($tex,$status,7,'$$'); } } if (!defined $end) {return;} } } sub parse_macro { my ($tex,$next,$status,$substat)=@_; if (my $label=$BreakPoints{$next}) { if ($tex->{'line'}=~ /^[*]?(\s*\[.*?\])*\s*\{(.+?)\}/ ) { $label=$label.': '.$2; } add_subcount($tex,$label); } set_style($tex,$status>0?'command':'exclcommand'); if ($next eq '\begin' && $status!=-2) { parse_begin_end($tex,$status); } elsif (($status==-1) && ($substat=$TeXfloatinc{$next})) { # text included from float set_style($tex,'command'); gobble_macro_parms($tex,$substat); } elsif ($status==-9 && defined ($substat=$TeXpreamble{$next})) { # parse preamble include macros set_style($tex,'command'); if (defined $TeXheader{$next}) {inc_count($tex,4);} gobble_macro_parms($tex,$substat,1); } elsif ($status<0) { # ignore gobble_option($tex); } elsif ($next eq '\(') { # math inline parse_math($tex,$status,6,'\)'); } elsif ($next eq '\[') { # math display parse_math($tex,$status,7,'\]'); } elsif ($next eq '\def') { # ignore \def... $tex->{'line'} =~ s/^([^\{]*)\{/\{/; flush_next($tex); print_style($1.' ','ignore'); parse_unit($tex,-2); } elsif (defined (my $addsuffix=$TeXfileinclude{$next})) { # include file: queue up for parsing parse_include_file($tex,$status,$addsuffix); } elsif (defined ($substat=$TeXmacro{$next})) { # macro: exclude options if (defined $TeXheader{$next}) {inc_count($tex,4);} gobble_macro_parms($tex,$substat,$status); } elsif (defined ($substat=$TeXmacroword{$next})) { # count macro as word (or a given number of words) inc_count($tex,$status,$substat); set_style($tex,'word'.$status); } elsif ($next =~ /^\\[^\w\_]/) { } else { gobble_option($tex); } } sub parse_tc { my ($tex)=@_; my $next=$tex->{'next'}; set_style($tex,'tc'); flush_next($tex); if (!($next=~s/^\%+TC:\s*(\w+)\s*// )) { print_error('Warning: TC command should have format %TC:instruction [macro] [parameters]'); return; }; my $instr=$1; if ($instr=~/^(break)$/) { if ($instr eq 'break') {add_subcount($tex,$next);} } elsif ($next=~/^([\\]*\w+)\s+([^\s\n]+)(\s+([0-9]+))?/) { # Format = TC:word macro my $macro=$1; my $param=$2; my $option=$4; if ($param=~/^\[([0-9,]+)\]$/) {$param=[split(',',$1)];} if (($instr eq 'macro') || ($instr eq 'exclude')) {$TeXmacro{$macro}=$param;} elsif ($instr eq 'header') {$TeXheader{$macro}=$param;$TeXmacro{$macro}=$param;} elsif ($instr eq 'macroword') {$TeXmacroword{$macro}=$param;} elsif ($instr eq 'preambleinclude') {$TeXpreamble{$macro}=$param;} elsif ($instr eq 'group') { $TeXmacro{'begin'.$macro}=$param; $TeXgroup{$macro}=$option; } elsif ($instr eq 'floatinclude') {$TeXfloatinc{$macro}=$param;} elsif ($instr eq 'fileinclude') {$TeXfileinclude{$macro}=$param;} elsif ($instr eq 'breakmacro') {$BreakPoints{$macro}=$param;} else {print_error("Warning: Unknown TC command: ".$instr);} } elsif ($instr eq 'ignore') { tc_ignore_input($tex); } else { print_error("Warning: Invalid TC command format: ".$instr); } } sub tc_ignore_input { my ($tex)=@_; set_style($tex,'ignore'); parse_unit($tex,-3,"%TC:endignore"); set_style($tex,'tc'); flush_next($tex); } sub parse_math { my ($tex,$status,$substat,$end)=@_; my $localstyle=$status>0 ? 'mathgroup' : 'exclmath'; if ($status>0) {inc_count($tex,$substat);} set_style($tex,$localstyle); parse_unit($tex,0,$end); set_style($tex,$localstyle); } sub parse_begin_end { my ($tex,$status)=@_; my $localstyle=$status>0 ? 'grouping' : 'exclgroup'; flush_style($tex,$localstyle); gobble_option($tex); my $groupname; if ($tex->{'line'} =~ s/^\{\s*([^\{\}]+)\s*\*?\}[ \t\r\f]*//) { # gobble group type $groupname=$1; print_style('{'.$1.'}',$localstyle); my $next='begin'.$1; if (defined (my $substat=$TeXmacro{$next})) { gobble_macro_parms($tex,$substat); } } else { print_error("Warning: BEGIN group without type."); } # find group status (or leave unchanged) my $substat; defined ($substat=$TeXgroup{$1}) || ($substat=$status); if ($status<=0 && $status<$substat) {$substat=$status;} if (($status>0) && ($substat==-1)) { # Count float inc_count($tex,5); } if ($status>0 and $substat>3) { # count item, exclude contents inc_count($tex,$substat); $substat=0; } parse_unit($tex,$substat,'\end'); if ($tex->{'line'} =~ s/^\{\s*([^\{\}]+)\s*\}[ \t\r\f]*//) { # gobble group type flush_style($tex,$localstyle); print_style('{'.$1.'}',$localstyle); } else { print_error("Warning: END group without type while waiting to end ".$groupname."."); } } sub parse_include_file { my ($tex,$status,$addsuffix)=@_; $tex->{'line'} =~ s/^\{([^\{\}\s]+)\}// || $tex->{'line'} =~ s/^\s*([^\{\}\%\\\s]+)// || return; flush_next($tex); if ($status>0) { print_style($&,'fileinclude'); my $fname=$1; if ($addsuffix==2) {$fname.='.tex';} elsif ($addsuffix==1 && ($fname=~/^[^\.]+$/)) {$fname.='.tex';} if ($includeTeX) {add_file_to_list($fname);} } else { print_style($&,'ignored'); } } sub gobble_option { my $tex=shift @_; flush_next($tex); foreach my $pattern (@MacroOptionPatterns) { if ($tex->{'line'}=~s/^($pattern)//) { print_style($1,'option'); return $1; } } return undef; } sub gobble_options { while (gobble_option(@_)) {} } sub gobble_macro_modifier { my $tex=shift @_; flush_next($tex); if ($tex->{'line'} =~ s/^\*//) { print_style($1,'option'); return $1; } return undef; } sub gobble_macro_parms { my ($tex,$parm,$oldstat)=@_; my $i; if (ref($parm) eq 'ARRAY') { $i=scalar @{$parm}; } else { $i=$parm; $parm=[0,0,0,0,0,0,0,0,0]; } if ($i>0) {gobble_macro_modifier($tex);} gobble_options($tex); for (my $j=0;$j<$i;$j++) { parse_unit($tex,new_status($parm->[$j],$oldstat)); gobble_options($tex); } } sub new_status { my ($substat,$old)=@_; if (!defined $old) {return $substat;} if ($old==-3 || $substat==-3) {return -3;} if ($old==-2 || $substat==-2) {return -2;} if ($old==0 || $substat==0) {return 0;} if ($old==-9 || $substat==-9) {return -9;} if ($old>$substat) {return $old;} return $substat; } sub next_token { my $tex=shift @_; my ($next,$type); if (defined $tex->{'next'}) {print_style($tex->{'next'}.' ',$tex->{'style'});} $tex->{'style'}=undef; while (defined ($next=get_next_token($tex))) { $type=$tex->{'type'}; if ($type==0) { print_style($next,'comment'); } elsif ($type==9) { if ($verbose>0) {line_return(1,$tex);} } else { return $next; } } return $next; } sub get_next_token { # Token (or token group) category: # 0: comment # 1: word (or other forms of text or text components) # 2: symbol (not word, e.g. punctuation) # 3: macro # 4: curly braces {} # 5: brackets [] # 6: maths # 9: line break in file # 999: end of line or blank line # 666: TeXcount instruction (%TC:instruction) my $tex=shift @_; my $next; (defined ($next=get_token($tex,'\%+TC:\s*endignore\b[^\r\n]*',666))) && return "%TC:endignore"; (defined ($next=get_token($tex,'\%+TC:[^\r\n]*',666))) && return $next; (defined ($next=get_token($tex,'\%[^\r\n]*',0))) && return $next; (defined ($next=get_token($tex,'(\r|\n|\r\n)',9))) && return $next; (defined ($next=get_token($tex,'\\\\[\{\}]',2))) && return $next; foreach my $pattern (@WordPatterns) { (defined ($next=get_token($tex,$pattern,1))) && return $next; } (defined ($next=get_token($tex,'[\"\'\`:\.,\(\)\[\]!\+\-\*=/\^\_\@\<\>\~\#\&]',2))) && return $next; (defined ($next=get_token($tex,'\\\\([a-zA-Z_]+|[^a-zA-Z_])',3))) && return $next; (defined ($next=get_token($tex,'[\{\}]',4))) && return $next; (defined ($next=get_token($tex,'[\[\]]',5))) && return $next; (defined ($next=get_token($tex,'\$\$',6))) && return $next; (defined ($next=get_token($tex,'\$',6))) && return $next; (defined ($next=get_token($tex,'.',999))) && return $next; (defined ($next=get_token($tex,'[^\s]+',999))) && return $next; $tex->{'eof'}=1; return undef; } sub get_token { my ($tex,$regexp,$type)=@_; if (!defined $regexp) {print_error("ERROR in get_token: undefined regex.");} if (!defined $tex->{'line'}) {print_error("ERROR in get_token: undefined tex-line. ".$tex->{'next'});} if ( $tex->{'line'} =~ s/^($regexp)[ \t\r\f]*// ) { $tex->{'next'}=$1; $tex->{'type'}=$type; return $1; } return undef; } ###### Count handling routines sub new_count { my ($title)=@_; my @cnt=(0,0,0,0,0,0,0,0); my %count=('count'=>\@cnt,'title'=>$title); # files, text words, header words, float words, # headers, floats, math-inline, math-display; return \%count; } sub inc_count { my ($tex,$type,$value)=@_; my $count=$tex->{'count'}; if (!defined $value) {$value=1;} ${$count->{'count'}}[$type]+=$value; } sub get_count { my ($count,$type)=@_; return ${$count->{'count'}}[$type]; } sub total_count { my ($count)=@_; my $sum=0; for (my $i=scalar(@sumweights);$i-->0;) { $sum+=get_count($count,$i+1)*$sumweights[$i]; } return $sum; } sub print_count { my ($count,$header)=@_; if ($briefsum && @sumweights) { print_count_total($count,$header); } elsif ($briefsum) { if ($htmlstyle) {print "

";} print_count_brief($count,$header); if ($htmlstyle) {print "

\n";} } else { print_count_details($count,$header); } } sub print_count_with_header { my ($count,$header)=@_; if (!defined $header) {$header=$count->{'title'};} if (!defined $header) {$header="";} return $count,$header; } sub print_count_total { my ($count,$header)=print_count_with_header(@_); if ($htmlstyle) {print "

".$header;} print total_count($count); if ($htmlstyle) {print "

\n";} else {print ": ".$header;} } sub print_count_brief { my ($count,$header)=print_count_with_header(@_); my $cnt=$count->{'count'}; print ${$cnt}[1]."+".${$cnt}[2]."+".${$cnt}[3]. " (".${$cnt}[4]."/".${$cnt}[5]."/".${$cnt}[6]."/".${$cnt}[7].") ". $header; } sub print_count_details { my ($count,$header)=print_count_with_header(@_); if ($htmlstyle) {print "
\n";} if (defined $header) { formatprint($header."\n",'dt','header'); } if (get_count($count,0)>1) { formatprint($countlabel[0].': ','dt'); formatprint(get_count($count,0)."\n",'dd'); } if (@sumweights) { formatprint('Sum count: ','dt'); formatprint(total_count($count)."\n",'dd'); } for (my $i=1;$i<8;$i++) { formatprint($countlabel[$i].': ','dt'); formatprint(get_count($count,$i)."\n",'dd'); } my $subcounts=$count->{'subcounts'}; if ($showsubcounts && defined $subcounts && scalar(@{$subcounts})>1) { formatprint("Subcounts: text+headers+captions (#headers/#floats/#inlines/#displayed)\n",'dt'); foreach my $subcount (@{$subcounts}) { if ($htmlstyle) {print "
";} print_count_brief($subcount); if ($htmlstyle) {print "
";} print "\n"; } } if ($htmlstyle) {print "
\n";} } sub add_count { my ($a,$b)=@_; for (my $i=0;$i<8;$i++) { ${$a->{'count'}}[$i]+=${$b->{'count'}}[$i]; } } sub add_subcount { my ($tex,$title)=@_; add_count($tex->{'countsum'},$tex->{'count'}); push @{$tex->{'countlist'}},$tex->{'count'}; $tex->{'count'}=new_count($title); return $tex->{'countsum'}; } ###### Printing routines sub set_style { my ($tex,$style)=@_; if (!(($tex->{'style'}) && ($tex->{'style'} eq '-'))) {$tex->{'style'}=$style;} } sub flush_style { my ($tex,$style)=@_; set_style($tex,$style); flush_next($tex); } sub line_return { my ($blank,$tex)=@_; if ($blank>$blankline) { if ((defined $tex) && @sumweights) { my $num=total_count($tex->{'count'}); print_style(" [".$num."]","sumcount"); } linebreak(); $blankline++; } } sub linebreak { if ($htmlstyle) {print "
\n";} else {print "\n";} } sub print_style { my ($text,$style,$state)=@_; (($verbose>=0) && (defined $text) && (defined $style)) || return 0; my $colour; ($colour=$STYLE{$style}) || return; if (($colour) && !($colour eq '-')) { print_with_style($text,$style,$colour); if ($state) { print_style($state,'state'); } $blankline=-1; return 1; } else { return 0; } } sub print_error { my $text=shift @_; $errorcount++; if ($verbose>=0) { line_return(1); print_style("!!! ".$text." !!!",'error'); line_return(1); } } sub formatprint { my ($text,$tag,$class)=@_; my $break=($text=~s/\n$//); if ($htmlstyle && defined $tag) { print '<'.$tag; if ($class) {print " class='".$class."'";} print '>'.$text.''; } else { print $text; } if ($break) {print "\n";} } sub flush_next { my $tex=shift @_; if (defined $tex->{'next'}) { print_style($tex->{'next'}.' ',$tex->{'style'},$tex->{'printstate'}); } $tex->{'printstate'}=undef; $tex->{'style'}='-'; } # Close the output, e.g. adding HTML tail sub Close_Output { if ($htmlstyle>1) { html_tail(); } } # Report if there were any errors occurring during parsing sub Report_ErrorCount { if ($errorcount==0) {return;} if ($briefsum && $totalflag) {print " ";} if ($htmlstyle) { print_error("Errors:".$errorcount,"p","error"); } else { print "(errors:".$errorcount.")"; } } sub print_help_style { if ($verbose<=0) {return;} formatprint("Format/colour codes of verbose output:","h2"); print "\n\n"; if ($htmlstyle) {print "

";} help_style_line('Text which is counted',"word1","counted as text words"); help_style_line('Header and title text',"word2","counted as header words"); help_style_line('Caption text and footnotes',"word3","counted as caption words"); help_style_line("Ignored text or code","ignore","excluded or ignored"); help_style_line('\documentclass',"document","document start, beginning of preamble"); help_style_line('\macro',"command","macro not counted, but parameters may be"); help_style_line('\macro',"exclcommand","macro in excluded region"); help_style_line("[Macro options]","option","not counted"); help_style_line('\begin{group} \end{group}',"grouping","begin/end group"); help_style_line('\begin{group} \end{group}',"exclgroup","begin/end group in excluded region"); help_style_line('$ $',"mathgroup","counted as one equation"); help_style_line('$ $',"exclmath","equation in excluded region"); help_style_line('% Comments',"comment","not counted"); help_style_line('%TC:TeXcount instructions',"tc","not counted"); help_style_line("File to include","fileinclude","not counted but file may be counted later"); if ($showstates) { help_style_line('[state]',"state","internal TeXcount state"); } if (@sumweights) { help_style_line('[sumcount]',"sumcount","cumulative sum count"); } help_style_line("ERROR","error","TeXcount error message"); if ($htmlstyle) {print "

";} print "\n\n"; } sub help_style_line { my ($text,$style,$comment)=@_; if ($htmlstyle) { $comment="  ....  ".$comment; } else { $comment=" .... ".$comment; } if (print_style($text,$style)) { print $comment; linebreak(); } } # Print output style codes if conditions are met sub conditional_print_help_style { if ($showcodes) {print_help_style();} return $showcodes; } ###### HTML routines sub html_head { print "\n"; if ($utf8flag) { print "\n"; } print '

LaTeX word count

'; } sub html_tail { print ''; } ###### Help routines sub print_version { print "TeXcount version ".$versionnumber.", ".$versiondate.'.'; } sub print_syntax { print ' Syntax: TeXcount.pl [options] files Options: -relaxed Uses relaxed rules for word and option handling: i.e. allows more general cases to be counted as either words or macros. -v Verbose (same as -v3) -v0 Do not present parsing details -v1 Verbose: print parsed words, mark formulae -v2 More verbose: also print ignored text -v3 Even more verbose: include comments and options -v4 Same as -v3 -showstate -showstate Show internal states (with verbose) -brief Only prints a brief, one line summary of counts -q, -quiet Quiet mode, no error messages (use is discouraged!) -sum, -sum= Make sum of all word and equation counts. May also use -sum=#[,#] with up to 7 numbers to indicate how each of the counts (text words, header words, caption words, #headers, #floats, #inlined formulae, #displayed formulae) are summed. The default sum (if only -sum is used) is the same as -sum=1,1,1,0,0,1,1. -sub, -sub= Generate subcounts. Option values are none, part, chapter, section or subsection. Default (-sub) is set to subsection, whereas unset is none. (Alternative option name is -subcount.) -nc, -nocol No colours (colours require ANSI) -html Output in HTML format -htmlcore Only HTML body contents -inc Include tex files included in the document -noinc Do not include included tex files (default) -total Do not give sums per file, only total sum. -1 Same as -brief and -total. Ensures there is only one line of output. If used in conjunction with -sum, the output will only be the total number. (NB: Character is the number one, not the letter L.) -dir, -dir= Specify the working directory using -dir=path. Remember that the path must end with \ or /. If only -dir is used, the directory of the parent file is used. -utf8, -unicode Turns on Unicode (UTF-8) for input and output. This is automatic with -chinese, and is required to handle e.g. Korean text. Note that the TeX file must be save in UTF-8 format (not e.g. GB2312 or Big5), or the result will be unpredictable. -ch, -chinese, -zhongwen Turns on support for Chinese characters. TeXcount will then count each Chinese character as a word. Automatically turns on -utf8. -jp, -japanese Turns on support for Japanese characters. TeXcount will count each Japanese character (kanji, hiragana, and katakana) as one word, i.e. not do any form of word segmentation. Automatically turns on -utf8. -codes Display output style code overview and explanation. This is on by default. -nocodes Do not display output style code overview. -h, -?, --help, /? Help --version Print version number --license License information '; } sub print_help { print ' *************************************************************** * TeXcount.pl '.$versionnumber.', '.$versiondate.' * Count words in TeX and LaTeX files, ignoring macros, tables, formulae, etc. '; print_syntax(); print ' The script counts words as either words in the text, words in headers/titles or words in floats (figure/table captions). Macro options (i.e. \marco[...]) are ignored; macro parameters (i.e. \macro{...}) are counted or ignored depending on the macro, but by default counted. Begin-end groups are by default ignored and treated as \'floats\', though some (e.g. center) are counted. Unless -nocol (or -nc) has been specified, the output will be colour coded. Counted text is coloured blue with headers are in bold and in HTML output caption text is italicised. Mathematical formulae are not counted as words, but are instead counted separately with separate counts for inlined formulae and displayed formulae. Similarly, the number of headers and the number of \'floats\' are counted. Note that \'float\' is used here to describe anything defined in a begin-end group unless explicitly recognized as text or mathematics. The verbose options (-v1, -v2, -v3, showstate) produces output indicating how the text has been interpreted. Check this to ensure that words in the text has been interpreted as such, whereas mathematical formulae and text/non-text in begin-end groups have been correctly interpreted. Parsing instructions may be passed to TeXcount using comments in the LaTeX files on the format %TC:instruction arguments where valid instructions for setting parsing rules, typically set at the start of the document (applies globally), are: %TC:macro [macro] [param.states] macro handling rule, no. of and rules for parameters %TC:macroword [macro] [number] macro counted as a given number of words %TC:header [macro] [param.states] header macro rule, as macro but counts as one header %TC:breakmacro [macro] [label] macro causing subcount break point %TC:group [name] [parsing-state] begin-end-group handling rule %TC:floatinclude [macro] [param.states] as macro, but also counted inside floats %TC:preambleinclude [macro] [param.states] as macro, but also counted inside the preamble %TC:fileinclue [macro] [rule] file include, add .tex if rule=2, not if rule=0 The [param.states] is used to indicate the number of parameters used by the macro and the rules of handling each of these: format is [#,#,...,#] with one number for each parameter, and main rules are 0 to ignore and 1 to count as text. Parsing instructions which may be used anywhere are: %TC:ignore start block to ignore %TC:endignore end block to ignore %TC:break [title] add subcount break point here See the documentation for more details. Unix hint: Use \'less -r\' instead of just \'less\' to view output: the \'-r\' option makes less treat text formating codes properly. Windows hint: If your Windows interprets ANSI colour codes, lucky you! Otherwise, use the -nocol (or -nc) option with the verbose options or the output will be riddled with colour codes. Instead, you can use -html to produce HTML code, write this to file and view with your favourite browser. '; print_reference(); } sub print_reference { print ' The TeXcount script is copyright of Einar Andreas Rødland (2008) and published under the LaTeX Project Public License. For more information about the script, e.g. news, updates, help, usage tips, known issues and short-comings, go to http://folk.uio.no/einarro/Comp/texwordcount.html or go to http://folk.uio.no/einarro/Services/texcount.html to access the script as a web service. Feedback such as problems or errors can be reported to einarro@ifi.uio.no. '; } sub print_license { print 'TeXcount version '.$versionnumber.' Copyright 2008 Einar Andreas Rødland The TeXcount script is published under the LaTeX Project Public License (LPPL) http://www.latex-project.org/lppl.txt which grants you, the user, the right to use, modify and distribute the script. However, if the script is modified, you must change its name or use other technical means to avoid confusion. The script has LPPL status "maintained" with Einar Andreas Rødland being the current maintainer. '; }