Contributor: MATT BOUSEK
(*
Here is TALLY.PAS, a program that Matt Bousek wrote
to do a word frequency analysis on a text file. It uses an AVL tree. It
should compile under TP 6.0 or BP 7.0
*)
program word_freq(input,output);
type
short_str = string[32];
{************AVLtree routines*********}
type
balance_set = (left_tilt,neutral,right_tilt);
memptr = ^memrec;
memrec = record
balance : balance_set;
left,right : memptr;
count : longint;
key : short_str;
end;
{**************************************}
procedure rotate_right(var root:memptr);
var ptr2,ptr3 : memptr;
begin
ptr2:=root^.right;
if ptr2^.balance=right_tilt then begin
root^.right:=ptr2^.left;
ptr2^.left:=root;
root^.balance:=neutral;
root:=ptr2;
end else begin
ptr3:=ptr2^.left;
ptr2^.left:=ptr3^.right;
ptr3^.right:=ptr2;
root^.right:=ptr3^.left;
ptr3^.left:=root;
if ptr3^.balance=left_tilt
then ptr2^.balance:=right_tilt
else ptr2^.balance:=neutral;
if ptr3^.balance=right_tilt
then root^.balance:=left_tilt
else root^.balance:=neutral;
root:=ptr3;
end;
root^.balance:=neutral;
end;
{*************************************}
procedure rotate_left(var root:memptr);
var ptr2,ptr3 : memptr;
begin
ptr2:=root^.left;
if ptr2^.balance=left_tilt then begin
root^.left:=ptr2^.right;
ptr2^.right:=root;
root^.balance:=neutral;
root:=ptr2;
end else begin
ptr3:=ptr2^.right;
ptr2^.right:=ptr3^.left;
ptr3^.left:=ptr2;
root^.left:=ptr3^.right;
ptr3^.right:=root;
if ptr3^.balance=right_tilt
then ptr2^.balance:=left_tilt
else ptr2^.balance:=neutral;
if ptr3^.balance=left_tilt
then root^.balance:=right_tilt
else root^.balance:=neutral;
root:=ptr3;
end;
root^.balance:=neutral;
end;
{*****************************************************************}
procedure insert_mem(var root:memptr; x:short_str; var ok:boolean);
begin
if root=nil then begin
new(root);
with root^ do begin
key:=x;
left:=nil;
right:=nil;
balance:=neutral;
count:=1;
end;
ok:=true;
end else begin
if x=root^.key then begin
ok:=false;
inc(root^.count);
end else begin
if xnil then begin
dump_mem(root^.left);
writeln(root^.count:5,' ',root^.key);
dump_mem(root^.right);
end;
end;
{MAIN***************************************************************}
{*** This program was written by Matt Bousek sometime in 1992. ***}
{*** The act of this posting over Internet makes the code public ***}
{*** domain, but it would be nice to keep this header. ***}
{*** The basic AVL routines came from a book called "Turbo Algo- ***}
{*** rythms", Sorry, I don't have the book here and I can't ***}
{*** remember the authors or publisher. Enjoy. And remember, ***}
{*** there is no free lunch... ***}
const
wchars:set of char=['''','a'..'z'];
var
i,j : word;
aword : short_str;
subject : text;
wstart,wend : word;
inword : boolean;
linecount : longint;
wordcount : longint;
buffer : array[1..10240] of char;
line : string;
filename : string;
tree : memptr;
BEGIN
tree:=nil;
filename:=paramstr(1);
if filename='' then filename:='tally.pas';
assign(subject,filename);
settextbuf(subject,buffer);
reset(subject);
wordcount:=0;
linecount:=0;
while not eof(subject) do begin
inc(linecount);
readln(subject,line);
wstart:=0; wend:=0;
for i:=1 to byte(line[0]) do begin
if line[i] in ['A'..'Z'] then line[i]:=chr(ord(line[i])+32);
inword:=(line[i] in wchars);
if inword and (wstart=0) then wstart:=i;
if inword and (wstart>0) then wend:=i;
if not(inword) or (i=byte(line[0])) then begin
if wend>wstart then begin
aword:=copy(line,wstart,wend+1-wstart);
j:=byte(aword[0]);
if (aword[j]='''') and (j>2) then begin {lose trailing '}
aword:=copy(aword,1,j-1);
dec(wend);
dec(j);
end;
if (aword[1]='''') and (j>2) then begin {lose leading '}
aword:=copy(aword,2,j-1);
inc(wstart);
dec(j);
end;
if (j>2) and (aword[j-1]='''') and (aword[j]='s') then
begin {lose trailing 's}
aword:=copy(aword,1,j-2);
dec(wend,2);
dec(j,2);
end;
if (j>2) then begin
inc(wordcount);
insert_memtree(tree,aword);
end;
end; { **if wend>wstart** }
wstart:=0; wend:=0;
end; { **if not(inword)** }
end; { **for byte(line[0])** }
end; { **while not eof** }
dump_mem(tree);
writeln(linecount,' lines, ',wordcount,' words.');
END.