# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by Ti Lian Hwang <tilh@mars> on Thu Jul  6 15:14:10 1995
#
# This archive contains:
#	dummy.pl	h2a_pcl.pl	parse-html.pl	html-ascii.pl	
#

LANG=""; export LANG
PATH=/bin:/usr/bin:$PATH; export PATH

echo x - dummy.pl
cat >dummy.pl <<'@EOF'
# put your drivers in a file
# 'h2a_$TERM.pl' where $TERM is the environment variable
#
sub html_begin_doc {}

sub html_end_doc {}

sub begin_font {}

sub end_font {}

1;
@EOF

chmod 664 dummy.pl

echo x - h2a_pcl.pl
cat >h2a_pcl.pl <<'@EOF'
#
# HP-PCL font functions to be used by html-to-ascii.pl
# "required" by parse-html"
#
# Author : Ti Lian Hwang
# email  : tilh@sin-ro.sin-ro.dhl.com
#
#
# fixed spacing, 12 pitch, 12 points, Times Roman
$normal_font = "\033(s0p12h12v3T";

sub html_begin_doc {
	# reset, CR=CR,LF=LF+CR,FF=FF+CR
	print "\033E\033&k2G";
}

sub html_end_doc {
	print "\033E";
}

$Begin{"B"} = "begin_font";
$End{"B"} = "end_font";      

$Begin{"I"} = "begin_font";
$End{"I"} = "end_font";      

$Begin{"U"} = "begin_font";
$End{"U"} = "end_font";      

$Begin{"EM"} = "begin_font";
$End{"EM"} = "end_font";      

%font_begin = (
	"PRE","\033(s0p12h3T",
	"H1","\033(s1p20v5t3b",
	"H2","\033(s1p18v5t3b",
	"H3","\033(s1p16v5t3B",
	"H4","\033(s1p14v5t3B",
	"H5","\033(s1p12v5t3B",
	"H6","\033(s1p12v5t3B",
	"B","\033(s3B",
	"U","\033&dD",
	"I","\033(s1S",
	"EM","\033(s3B"
);

%font_end = (
	"PRE",$normal_font,
	"H1",$normal_font . "\033(s0B",
	"H2",$normal_font . "\033(s0B",
	"H3",$normal_font . "\033(s0B",
	"H4",$normal_font . "\033(s0B",
	"H5",$normal_font . "\033(s0B",
	"H6",$normal_font . "\033(s0B",
	"B","\033(s0B",
	"U","\033&d@",
	"I","\033(s0S",
	"EM","\033(s0B"
);

sub begin_font {
local ($element, $tag) = @_;
print $font_begin{$element};
}

sub end_font {
local ($element, $tag) = @_;
print $font_end{$element};
}

1;
@EOF

chmod 664 h2a_pcl.pl

echo x - parse-html.pl
sed 's/^@//' >parse-html.pl <<'@EOF'
# HTML parser
# Jim Davis, July 15 1994

# This is an HTML parser not an SGML parser.  It does not parse a DTD,
# The DTD is implicit in the code, and specific to HTML.  
# The processing of the HTML can be customized by the user by
#   1) Defining routines to be called for various tags (see Begin and End arrays)
#   2) Defining routines html_content and html_whitespace

# This is not a validating parser.   It does not check the content model
# eg you can use DT outside a DL and it won't know.  It is too liberal in
# what tags are allowed to minimize what other tags.

# Bugs - can't parse the prolog or whatever you call it
#   <!DOCTYPE HTML [
#     <!entity % HTML.Minimal "INCLUDE">
#     <!-- Include standard HTML DTD -->
#     <!ENTITY % html PUBLIC "-//connolly hal.com//DTD WWW HTML 1.8//EN">
#     %html;
#     ]>

# modified 3 Aug to add a bunch of HTML 2.0 tags
# modified 3 Sept to print HTML stack to STDERR not STDOUT, to add new
#  routines html_begin_doc and html_end_doc for application specific cleanup
#  and to break parse_html into two pieces.
# modified 30 Sept 94.  parse_attributes now handles tag attributes that
#   don't have values.  thanks to  Bill Simpson-Young <bill@syd.dit.csiro.au>
#   for the code.
# modified 17 Apr 95 to support FORMS tags.
#
# modified 6 July 1995 by Ti Lian Hwang <tilh@sin-co.sin-ro.dhl.com>
#   to handle 'printer drivers' - files to be 'require' depending on $TERM
#   files are to have name of 'h2a_$TERM.pl'
#

$filename = "h2a_" . $ENV{"TERM"} . ".pl";
foreach $prefix (@INC) {                
    $realfilename = "$prefix/$filename";
    if (-f $realfilename) { 
	require "$filename" ;
	$filename = "";
	last;
    }
}
if ($filename) {require "dummy.pl" ; }

$debug = 0;

$whitespace_significant = 0;

# global variables: 
#  $line_buffer is line buffer
#  $line_count is input line number.

$line_buffer = "";
$line_count = 0;

sub parse_html {
    local ($file) = @_;
    open (HTML, $file) || die "Could not open $file: $!\nStopped";
    &parse_html_stream ();
    close (HTML);}

# Global input HTML is the handle to the stream of HTML
sub parse_html_stream {
    local ($token, $new);

    ## initialization
    @stack=();
    $line_count = 0;
    $line_buffer = "";

    ## application specific initialization
    &html_begin_doc();

  main:
    while (1) {

	# if whitespace does not matter, trim any leading space.
	if (! $whitespace_significant) {
	    $line_buffer =~ s/^\s+//;}

	# now dispatch on the type of token

	if ($line_buffer =~ /^(\s+)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_whitespace ($token);}

	# This will lose if there is more than one comment on the line!
	elsif ($line_buffer =~ /^(\<!--.*-->)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_comment ($token);}

	elsif ($line_buffer =~ /^(\<![^-][^\>]*\>)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_comment ($token);}

	elsif ($line_buffer =~ /^(\<\/[^\>]*\>)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_etag ($token);}

	elsif ($line_buffer =~ /^(\<[^!\/][^\>]*\>)/) {
	    $token = $1;
	    $line_buffer = $';
	    &html_tag ($token);}

	elsif ($line_buffer =~ /^([^\s<]+)/) {
	    $token = $1;
	    $line_buffer = $';
	    $token = &substitute_entities($token);
	    &html_content ($token); }

	else {
	    # No valid token in buffer.  Maybe it's empty, or maybe there's an
	    # incomplete tag.  So get some more data.
	    $new = <HTML>;
	    if (! defined ($new)) {last main;}
	    # if we're trying to find a match for a tag, then get rid of embedded newline
	    # this is, I think, a kludge
	    if ($line_buffer =~ /^\</ && $line_buffer =~ /\n$/) {
		chop $line_buffer;
		$line_buffer .= " ";}
	    $line_buffer .= $new;
	    $line_count++;}
    }

    ## cleanup
    &html_end_doc();

    if ($#stack > -1) {
	print STDERR "Stack not empty at end of document\n";
	&print_html_stack();}
}


sub html_tag {
    local ($tag) = @_;
    local ($element) = &tag_element ($tag);
    local (%attributes) = &tag_attributes ($tag);

    # the tag might minimize (be an implicit end) for the previous tag
    local ($prev_element);
    while (&Minimizes(&stack_top_element(), $element)) {
	$prev_element = &stack_pop_element ();
	if ($debug)  {
	    print STDERR "MINIMIZING $prev_element with $element on $line_count\n";}
	&html_end ($prev_element, 0);}

    push (@stack, $tag);

    &html_begin ($element, $tag, *attributes);

    if (&Empty($element)) {
	pop(@stack);
	&html_end ($element, 0);}
}

sub html_etag {
    local ($tag) = @_;
    local ($element) = &tag_element ($tag);

    # pop stack until find matching tag.  This is probably a bad idea,
    # or at least too general.
    local ( $prev_element) = &stack_pop_element();
    until ($prev_element eq $element) {
	if ($debug) {
	    print STDERR "MINIMIZING $prev_element with /$element on $line_count \n";}
	&html_end ($prev_element, 0);

	if ($#stack == -1) {
	    print STDERR "No match found for /$element.  You will lose\n";
	    last;}
	$prev_element = &stack_pop_element();}

    &html_end ($element, 1);
}


# For each element, the names of elements which minimize it.
# This is of course totally HTML dependent and probably I have it wrong too
$Minimize{"DT"} = "DT:DD";
$Minimize{"DD"} = "DT";
$Minimize{"LI"} = "LI";
$Minimize{"P"} = "P:DT:LI:H1:H2:H3:H4:BLOCKQUOTE:UL:OL:DL";

# Does element E2 minimize E1?
sub Minimizes {
    local ($e1, $e2) = @_;
    local ($value) = 0;
    foreach $elt (split (":", $Minimize{$e1})) {
	if ($elt eq $e2) {$value = 1;}}
    $value;}

$Empty{"BASE"} = 1;
$Empty{"BR"} = 1;
$Empty{"HR"} = 1;
$Empty{"IMG"} = 1;
$Empty{"ISINDEX"} = 1;
$Empty{"LINK"} = 1;
$Empty{"META"} = 1;
$Empty{"NEXTID"} = 1;
$Empty{"INPUT"} = 1;

# Empty tags have no content and hence no end tags
sub Empty {
    local ($element) = @_;
    $Empty{$element};}


sub print_html_stack {
    print STDERR "\n  ==\n";
    foreach $elt (reverse @stack) {print STDERR "  $elt\n";}
    print STDERR "  ==========\n";}

# The element on top of stack, if any.
sub stack_top_element {
    if ($#stack >= 0) {	&tag_element ($stack[$#stack]);}}

sub stack_pop_element {
    &tag_element (pop (@stack));}

# The element from the tag, normalized.
sub tag_element {
    local ($tag) = @_;
    $tag =~ /<\/?([^\s>]+)/;
    local ($element) = $1;
    $element =~ tr/a-z/A-Z/;
    $element;}

# associative array of the attributes of a tag.
sub tag_attributes {
    local ($tag) = @_;
    $tag =~ /^<[A-Za-z]+ +(.*)>$/;
    &parse_attributes($1);}

# string should be something like
# KEY="value" KEY2="longer value"  KEY3="tags o doom"
# output is an associative array (like a lisp property list)
# attributes names are not case sensitive, do I downcase them
# Maybe (probably) I should substitute for entities when parsing attributes.

sub parse_attributes {
    local ($string) = @_;
    local (%attributes);
    local ($name, $val);
  get: while (1) {
      if ($string =~ /^ *([A-Za-z]+)=\"([^\"]*)\"/) {
	  $name = $1;
	  $val = $2;
	  $string = $';
	  $name =~ tr/A-Z/a-z/;
	  $attributes{$name} = $val; }
      elsif ($string =~ /^ *([A-Za-z]+)=(\S*)/) {
	  $name = $1;
	  $val = $2;
	  $string = $';
	  $name =~ tr/A-Z/a-z/;
	  $attributes{$name} = $val;}
      elsif ($string =~ /^ *([A-Za-z]+)/) {
	  $name = $1;
	  $val = "";
	  $string = $';
	  $name =~ tr/A-Z/a-z/;
	  $attributes{$name} = $val;}
      else {last;}}
    %attributes;}

sub substitute_entities {
    local ($string) = @_;
    $string =~ s/&amp;/&/g;
    $string =~ s/&lt;/</g;
    $string =~ s/&gt;/>/g;
    $string =~ s/&quot;/\"/g;
    $string;}


@@HTML_elements = (
		  "A",
		  "ADDRESS",
		  "B",
		  "BASE",
		  "BLINK",	#  Netscape addition :-(
		  "BLOCKQUOTE",
		  "BODY",
		  "BR",
		  "CITE",
		  "CENTER",	# Netscape addition :-(
		  "CODE",
		  "DD",
		  "DIR",
		  "DFN",
		  "DL",
		  "DT",
		  "EM",
		  "FORM",
		  "H1", "H2", "H3", "H4", "H5", "H6",
		  "HEAD",
		  "HR",
		  "HTML",
		  "I",
		  "ISINDEX",
		  "IMG",
		  "INPUT",
		  "KBD",
		  "LI",
		  "LINK",
		  "MENU",
		  "META",
		  "NEXTID",
		  "OL",
		  "OPTION",
		  "P",
		  "PRE",
		  "SAMP",
		  "SELECT",
		  "STRIKE",
		  "STRONG",
		  "TITLE",
		  "TEXTAREA",
		  "TT",
		  "U",
		  "UL",
		  "VAR",
		  );

sub define_element {
    local ($element) = @_;
    $Begin{$element} = "Noop";
    $End{$element} = "Noop";}

foreach $element (@HTML_elements) {&define_element($element);}

# do nothing
sub Noop {
    local ($element, $xxx) = @_;}

# called when a tag begins.  Dispatches using Begin
sub html_begin {
    local ($element, $tag, *attributes) = @_;

    local ($routine) = $Begin{$element};
    if ($routine eq "") {
	print STDERR "Unknown HTML element $element ($tag) on line $line_count\n";}
    else {
	&begin_font ($element, $explicit);
	eval "&$routine;"}
}

# called when a tag ends.  Explicit is 0 if tag end is because of minimization
# not that you should care.
sub html_end {
    local ($element, $explicit) = @_;
    local ($routine) = $End{$element};
    if ($routine eq "") {
	print STDERR "Unknown HTML element \"$element\" (END $explicit) on line $line_count\n";}
    else {
	eval "&$routine(\"$element\", $explicit)";
	&end_font ($element, $explicit);
    }
}

sub html_content {
    local ($word) = @_;
}

sub html_whitespace {
    local ($whitespace) = @_;}

sub html_comment {
    local ($tag) = @_;}

# redefine these for application-specific initialization and cleanup

# sub html_begin_doc {}

# sub html_end_doc {}

# return a "true value" when loaded by perl.
1;

@EOF

chmod 644 parse-html.pl

echo x - html-ascii.pl
sed 's/^@//' >html-ascii.pl <<'@EOF'
# Routines for HTML to ASCII.
# (fixed width font, no font changes for size, bold, etc) with a little

# BUGS AND MISSING FEATURES
#  font tags (e.g. CODE, EM) cause an extra whitespace 
#   e.g. <TT>foo</TT>, -> foo ,

# Jim Davis July 15 1994
# modified 3 Aug 94 to support MENU and DIR

require "tformat.pl" || die "Could not load tformat.pl: $@\nStopped";

# Can be set by command line arg
if (! defined($columns_per_line)) {
    $columns_per_line = 72;}

if (! defined($flush_last_page)) {
    $flush_last_page = 1;}

# amount to add to indentation
$indent_left = 5;
$indent_right = 5;

# ignore contents inside HEAD.
$ignore_text = 0;

# Set variables in tformat
$left_margin = 1;
$right_margin = $columns_per_line;
$bottom_margin = 0;

## Routines called by html.pl
$Begin{"HEAD"} = "begin_head";
$End{"HEAD"} = "end_head";

sub begin_head {
    local ($element, $tag) = @_;
    $ignore_text = 1;}

sub end_head {
    local ($element) = @_;
    $ignore_text = 0;}

$Begin{"BODY"} = "begin_document";

sub begin_document {
    local ($element, $tag) = @_;
    &start_page();}

$End{"BODY"} = "end_document";

sub end_document {
    local ($element) = @_;
    &fresh_line();}

## Headers

$Begin{"H1"} = "begin_header";
$End{"H1"} = "end_header";

$Begin{"H2"} = "begin_header";
$End{"H2"} = "end_header";

$Begin{"H3"} = "begin_header";
$End{"H3"} = "end_header";

$Begin{"H4"} = "begin_header";
$End{"H4"} = "end_header";

$Skip_Before{"H1"} = 1;
$Skip_After{"H1"} = 1;

$Skip_Before{"H2"} = 1;
$Skip_After{"H2"} = 1;

$Skip_Before{"H3"} = 1;
$Skip_After{"H3"} = 0;

sub begin_header {
    local ($element, $tag) = @_;
    &skip_n_lines ($Skip_Before{$element}, 5);}

sub end_header {
    local ($element) = @_;
    &skip_n_lines ($Skip_After{$element});}

$Begin{"BR"} = "line_break";

sub line_break {
    local ($element, $tag) = @_;
    &fresh_line();}

$Begin{"P"} = "begin_paragraph";

# if fewer than this many lines left on page, start new page
$widow_cutoff = 5;

sub begin_paragraph {
    local ($element, $tag) = @_;
    &skip_n_lines (1, $widow_cutoff);}

$Begin{"BLOCKQUOTE"} = "begin_blockquote";
$End{"BLOCKQUOTE"} = "end_blockquote";

sub begin_blockquote {
    local ($element, $tag) = @_;
    $left_margin += $indent_left;
    $right_margin = $columns_per_line - $indent_right;
    &skip_n_lines (1);}

sub end_blockquote {
    local ($element) = @_;
    $left_margin -= $indent_left;
    $right_margin = $columns_per_line;
    &skip_n_lines (1);}

$Begin{"PRE"} = "begin_pre";
$End{"PRE"} = "end_pre";

sub begin_pre {
    local ($element, $tag) = @_;
    $whitespace_significant = 1;}

sub end_pre {
    local ($element) = @_;
    $whitespace_significant = 0;}

$Begin{"INPUT"} = "form_input";

sub form_input {
    local ($element, $tag, *attributes) = @_;
    if ($attributes{"value"} ne "") {
	&print_word_wrap($attributes{"value"});}}

$Begin{"HR"} = "horizontal_rule";

sub horizontal_rule {
    local ($element, $tag) = @_;
    &fresh_line ();
    &print_n_chars ($right_margin - $left_margin, "-");}

# Add code for IMG (use ALT attribute)
# Ignore I, B, EM, TT, CODE (no font changes)

## List environments

$Begin{"UL"} = "begin_itemize";
$End{"UL"} = "end_list_env";

$Begin{"OL"} = "begin_enumerated";
$End{"OL"} = "end_list_env";

$Begin{"MENU"} = "begin_menu";
$End{"MENU"} = "end_list_env";

$Begin{"DIR"} = "begin_dir";
$End{"DIR"} = "end_list_env";

$Begin{"LI"} = "begin_list_item";

@@list_stack = ();
$list_type = "bullet";
$list_counter = 0;

sub push_list_env {
    push (@list_stack, join (":", $list_type, $list_counter));}

sub pop_list_env {
    ($list_type, $list_counter) = split (":", pop (@list_stack));
    $left_margin -= $indent_left;}

sub begin_itemize {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "bullet";
    $list_counter = "*";}

sub begin_menu {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "bullet";
    $list_counter = "*";}

sub begin_dir {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "bullet";
    $list_counter = "*";}

sub begin_enumerated {
    local ($element, $tag) = @_;
    &push_list_env();
    $left_margin += $indent_left;
    $list_type = "enumerated";
    $list_counter = 1;}

sub end_list_env {
    local ($element) = @_;
    &pop_list_env();
#    &fresh_line();
}

sub begin_list_item {
    local ($element, $tag) = @_;
    $left_margin -= 2;
    &fresh_line();
    &print_word_wrap("$list_counter ");
    if ($list_type eq "enumerated") {$list_counter++;}
    $left_margin += 2;}

$Begin{"DL"} = "begin_dl";

sub begin_dl {
    local ($element, $tag) = @_;
    &skip_n_lines(1,5);}
    
$Begin{"DT"} = "begin_defined_term";
$Begin{"DD"} = "begin_defined_definition";
$End{"DD"} = "end_defined_definition";

sub begin_defined_term {
    local ($element, $tag) = @_;
    &fresh_line();}

sub begin_defined_definition {
    local ($element, $tag) = @_;
    $left_margin += $indent_left;
    &fresh_line();}

sub end_defined_definition {
    local ($element) = @_;
    $left_margin -= $indent_left;
    &fresh_line();}

$Begin{"META"} = "begin_meta";

# a META tag sets a value in the assoc array %Variable
# i.e. <META name="author" content="Rushdie"> sers $Variable{author} to "Rushdie"
sub begin_meta {
    local ($element, $tag, *attributes) = @_;
    local ($variable, $value);
    $variable = $attributes{name};
    $value = $attributes{content};
    $Variable{$variable} = $value;}

$Begin{"IMG"} = "begin_img";

sub begin_img {
    local ($element, $tag, *attributes) = @_;
    &print_word_wrap (($attributes{"alt"} ne "") ? $attributes{"alt"} : "[IMAGE]");}

# Content and whitespace.

sub html_content {
    local ($string) = @_;
    unless ($ignore_text) {
	&print_word_wrap ($string);}}

sub html_whitespace {
    local ($string) = @_;
    if (! $whitespace_significant) {
	die "Internal error, called html_whitespace when whitespace was not significant";}
    local ($i);
    for ($i = 0; $i < length ($string); $i++) {
	&print_whitespace (substr($string,$i,1));}}

# called by tformat.  Do nothing.
sub do_footer {
}

sub do_header {
}


1;
@EOF

chmod 644 html-ascii.pl

exit 0