##---------------------------------------------------------------------------##
##  File:
##      dtd.pl
##  Author:
##      Earl Hood       ehood@convex.com
##  Description:
##      This file defines the "dtd" perl package.
##---------------------------------------------------------------------------##
##  Copyright (C) 1994  Earl Hood, ehood@convex.com
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
## 
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##  
##  You should have received a copy of the GNU General Public License
##  along with this program; if not, write to the Free Software
##  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##---------------------------------------------------------------------------##
##
##	The following main routines are defined in dtd:
##
##	Routine Name		-- Brief Description
##  -------------------------------------------------------------------------
##	DTDget_elem_attr	-- Get attributes for an element
##	DTDget_elements		-- Get array of all elements
##	DTDget_parents		-- Get parent elements of an element
##	DTDget_top_elements	-- Get top-most elements
##	DTDis_attr_keyword	-- Check for reserved attribute value
##	DTDis_elem_keyword	-- Check for reserved element value
##	DTDis_keyword		-- Check for attr/elem related reserved word
##	DTDprint_tree		-- Output content tree for an element
##	DTDread_dtd		-- Parse a SGML dtd
##	DTDread_mapfile		-- Parse entity map file
##	DTDreset		-- Reset all internal data for DTD
##  -------------------------------------------------------------------------
##	Note:  The above routines are defined to be part of package main.
##	       Therefore, one might have to qualify the routine if it
##	       is being called in another package besides main.
##
##
##	There exists other routines defined in package dtd that might
##	be useful besides the main ones defined.  See below for more
##	information.
##
##  Extra Notes:
##	The "dtd" package makes the following assumptions:
##
##	    o The reference concrete syntax is assumed.  However,
##	      various variables in dtd_vars.pl can be redefined to
##	      try to accomodate an alternate syntax.  User beware,
##	      though.  I do not guarantee that the program will still
##	      behave as normal with redefined variables.
##
##	    o The SGML DTD is syntactically correct.  This libary
##	      is not intended as a validator.  Use sgmls, or other
##	      SGML validator, for such purposes.
##
##	    o This package does not parse an SGML decleration
##	      statement.
##
##	    o Tag and entity names can only contain the characters
##	      "A-Za-z_.-".  However, this can be changed by setting
##	      the variable $namechars in dtd_vars.pl.  There is no
##	      size limit on name length.
##
##	    o Tag names are treated with case-insensitivity, but entity
##	      names are case-sensitive.  Tag names are converted and
##	      stored in lowercase.
##
##	    o The only public text language code recognized is "EN".
##	      Other can be added by changing the $pubtl variable in
##	      dtd_vars.pl.
##
##	    o When parsing a map file for entities, the public text
##	      language code is used to separate the entity identifier
##	      from its system filename.  If no public text language code
##	      is detected, the assumption is made that the entity
##	      identifier contains no whitespaces (this is done for
##	      mappings defined only for SYSTEM entity names).
##
##	    o When resolving external entities with a map file, a check
##	      is first done if there is a mapping for the entity
##	      identifier.  Then, it tries to see if there is a mapping
##	      for the entity name.
##
##	    o Multiple contiguous whitespaces are ignored in entity
##	      identifiers.  I.e. Multiple contiguous whitespaces are
##	      treated as one whitespace character.
##
##---------------------------------------------------------------------------##
##
##  Current status of package:
##
##	o General entities are ignored.
##
##	o Any declerations (w/o subsets) not recognized by program, are
##	  explicitly ignored.  However, they are parsed enough to
##	  recognize any in-line comments so the program does not
##	  prematurely end the declerations.
##
##	o <!DOCTYPE  is parsed, but external reference to file not
##	  implemented, yet.  Concurrent DTDs are not distinguished.
##
##	o INCLUDE and IGNORE marked sections are processed with
##	  nested marked sections allowed.  CDATA and RCDATA marked
##	  sections are not recognized and may cause incorrect
##	  behavior.  However, CDATA and RCDATA marked sections
##	  do not normally appear in a DTD.
##
##	  IGNORE has higher precedence than INCLUDE in case of
##	  nested sections.
##
##      o LINKTYPE, NOTATION, SHORTREF, USEMAP declerations are
##	  ignored.
##
##---------------------------------------------------------------------------##

package dtd;

##***************************************************************************##
##			       GLOBAL VARIABLES				     ##
##***************************************************************************##
##-------------------------##
## SGML key word variables ##
##-------------------------##
$ATTLIST	= "ATTLIST";
$CDATA		= "CDATA";
$COMMENT	= "--";
$CONREF		= "CONREF";
$CURRENT	= "CURRENT";
$DOCTYPE	= "DOCTYPE";
$ELEMENT	= "ELEMENT";
$EMPTY		= "EMPTY";
$ENDTAG		= "ENDTAG";
$ENTITY		= "ENTITY";
$ENTITIES	= "ENTITIES";
$FIXED		= "FIXED";
$ID		= "ID";
$IDREF		= "IDREF";
$IDREFS		= "IDREFS";
$IGNORE		= "IGNORE";
$IMPLIED	= "IMPLIED";
$INCLUDE	= "INCLUDE";
$LINK		= "LINK";
$LINKTYPE	= "LINKTYPE";
$MD		= "MD";
$MS		= "MS";
$NAME		= "NAME";
$NAMES		= "NAMES";
$NDATA		= "NDATA";
$NMTOKEN	= "NMTOKEN";
$NMTOKENS	= "NMTOKENS";
$NOTATION	= "NOTATION";
$NUMBER		= "NUMBER";
$NUMBERS	= "NUMBERS";
$NUTOKEN	= "NUTOKEN";
$NUTOKENS	= "NUTOKENS";
$PCDATA		= "PCDATA";
$PI		= "PI";
$PUBLIC		= "PUBLIC";
$RCDATA		= "RCDATA";
$REQUIRED	= "REQUIRED";
$SDATA		= "SDATA";
$SHORTREF	= "SHORTREF";
$SIMPLE		= "SIMPLE";
$STARTTAG	= "STARTTAG";
$SUBDOC		= "SUBDOC";
$SYSTEM		= "SYSTEM";
$TEMP		= "TEMP";
$TEXT		= "TEXT";
$USELINK	= "USELINK";
$USEMAP		= "USEMAP";

##------------------------------##
## SGML key character variables ##
##------------------------------##
$mdo	= '<!';		# Markup decleration open
$mdc	= '>';		# Markup decleration close
$mdo1char = '<';
$mdo2char = '!';

$mso	= '\['; 	# Marked section open
$msc	= '\]\]';	# Marked section close

$rni	= '#';		# Reserved name indicator

$ero	= '&';		# General entity reference open
$pero	= '%';		# Parameter entity reference open
$cro	= '&#';		# Character reference open
$refc	= ';';		# Reference close

$dso	= '\[';		# Doc type decleration subset open
$dsc	= '\]';		# Doc type decleration subset close

$como	= '--';		# Comment open
$comc	= '--';		# Comment close
$comchar = '-';

$grpo	= '\(';		# Group open
$grpc	= '\)';		# Group close
$seq	= ',';		# Sequence connector
$and	= '&';		# And connector
$or	= '\|';		# Or connector
$opt	= '\?';		# Occurs zero or one time
$plus	= '\+';		# Occurs one or more times
$rep	= '\*';		# Occurs zero or more times
$inc	= '\+';		# Inclusion
$exc	= '-';		# Exclusion

$quotes	= q/'"/;	# Quote characters

##---------------------##
## SGML misc variables ##
##---------------------##
$pubtl	   = 'EN';	# Regular expr repesenting public text language
			# codes.  Additional languages codes can be added
			# by separating them with the '|' character with
			# no whitespaces.  This variable is used when
			# parsing the entity map file.

$namechars = '\w-\.';	# Regular expr repesenting characters in tag/entity
			# names.

$keywords = "$CDATA|$CONREF|$CURRENT|$EMPTY|$ENTITY|$ENTITIES|$FIXED|".
	    "$ID|$IDREF|$IDREFS|$IMPLIED|$NAME|$NAMES|$NDATA|$NMTOKEN|".
	    "$NMTOKENS|$NOTATION|$NUMBER|$NUMBERS|$NUTOKEN|$NUTOKENS|$PCDATA|".
	    "$RCDATA|$REQUIRED|$SDATA";

$elem_keywords = "$rni$PCDATA|$RCDATA|$CDATA|$EMPTY";
$attr_keywords = "$CDATA|$ENTITY|$ENTITIES|$ID|$IDREF|$IDREFS|$NAME|$NAMES|".
		 "$NMTOKEN|$NMTOKENS|$NOTATION|$NUMBER|$NUMBERS|$NUTOKEN|".
		 "$NUTOKENS";

##----------------------------##
## Entity maps: <!ENTITY ...> ##
##----------------------------##
%ParEntity	= ();	# Parameter entities
%PubParEntity	= ();	# External public parameter entities (PUBLIC)
%SysParEntity	= ();	# External system parameter entities (SYSTEM)
%GenEntity	= ();	# General entities
%StartTagEntity	= ();	# Start tag entities (STARTTAG)
%EndTagEntity	= ();	# End tag entities (ENDTAG)
%MSEntity	= ();	# Marked section entities (MS)
%MDEntity	= ();	# Markup decleration entities (MD)
%PIEntity	= ();	# Processing instruction entities (PI)
%CDataEntity	= ();	# Character data entities (CDATA)
%SDataEntity	= ();	# System data entities (SDATA)
%PubEntity	= ();	# External public entities (PUBLIC)
%SysEntity	= ();	# External system entities (SYSTEM)
%SysCDEntity	= ();	# Ext sys character data entities (SYSTEM CDATA)
%SysNDEntity	= ();	# Ext sys non-SGML data entities (SYSTEM NDATA)
%SysSDEntity	= ();	# Ext sys specific character entities (SYSTEM SDATA)
%SysSubDEntity	= ();	# Ext sys sub document entities (SYSTEM SUBDOC)

%Entity2Sys	= ();	# Map of entity names (or identifiers) to filenames.
			# This assoc array is used for PUBLIC and SYSTEM
			# external entities.

##--------------------------------##
## Notation maps: <!NOTATION ...> ##
##--------------------------------##
%SysNotation	= ();	# Valid notation names for SYSTEM entities
%PubNotation	= ();	# Valid notation names for PUBLIC entities

##---------------------------------##
## Short Ref maps: <!SHORTREF ...> ##
##---------------------------------##
%ShortRef	= ();	# Short ref mappings
%UseMap		= ();	# Element names associated to short ref (<!USEMAP ...>)

##------------------------------##
## Element maps: <!ELEMENT ...> ##
##------------------------------##
%ElemCont	= ();	# Base content of elements
%ElemInc	= ();	# Inclusion set
%ElemExc	= ();	# Exclusion set
%ElemTag	= ();	# Omitted tag minimization

##-----------------------------##
## Element map: <!ATTLIST ...> ##
##-----------------------------##
%Attribute	= ();	# Attributes for elements

##  %Attribute Description
##  ----------------------
##  The array is indexed by element names.  The value of each entry is the
##  name of an associative array which is indexed by the attribute names
##  for the element.  The associative array can be accessed via Perl's eval
##  operator.
##
##	Eg. Retrieve associative array of attributes for element 'para':
##
##		%attr = eval "%dtd'$dtd'Attribute{'para'}";
##
##	    You need the "dtd'" to qualify the variables since they were
##	    defined in package dtd (unless in package dtd).
##
##  The values of the attibute names' array contain a string of characters
##  separated by the $; variable.  Do a split on $; to get an array of all
##  possible values for an attribute name.
##
##	Eg. Retrieve possible values for 'para' attribute 'alignment':
##
##		@values = split(/$;/, $attr{'alignment'});
##
##  The first array value of the $; splitted array is the default value for
##  the attribute, and the other array values are all posible values for the
##  attribute.
##

##--------------##
## Function map ##
##--------------##
%Function = (
    $ATTLIST,	'do_attlist',
    $ELEMENT,	'do_element',
    $ENTITY,	'do_entity',
    $SHORTREF,	'do_shortref',
    $USEMAP,	'do_usemap',
);

##------------------------------------##
## Environment/Command-line Variables ##
##------------------------------------##
@P_SGML_PATH = split(/:/, $ENV{'P_SGML_PATH'}) if $ENV{'P_SGML_PATH'};
push(@P_SGML_PATH, '.', 'ents');
#-------#  @P_SGML_PATH defines a list of paths for searching for external
	#  entity references.  The user can define the environment
	#  variable P_SGML_PATH to tell the dtd libaray which paths to
	#  search.  The paths listed must be ':' separated.
	#-------------------------------------------------------------------

##***************************************************************************##
##			    DATA ACCESS ROUTINES			     ##
##***************************************************************************##
                            ##----------------##
                            ## Main Functions ##
                            ##----------------##
##---------------------------------------------------------------------------
##	DTDget_elements() retrieves all the elements defined in the DTD.
##
sub main'DTDget_elements {
    return sort keys %ElemCont;
}
##---------------------------------------------------------------------------
##	DTDget_elem_attr() retrieves an associative array defining the
##	attributes associated with element $elem.
##
sub main'DTDget_elem_attr {
    local($elem) = shift @_;
    local(%attr);

    $elem =~ tr/A-Z/a-z/;
    %attr = eval "%$Attribute{$elem}" if $Attribute{$elem};
    %attr;
}
##---------------------------------------------------------------------------
##	DTDget_top_elements() retrieves the top-most elements in the DTD.
##
sub main'DTDget_top_elements {
    &compute_parents() unless defined(%Parents);
    return sort keys %TopElement;
}
##---------------------------------------------------------------------------
sub main'DTDis_keyword {
    local($word) = shift;
    ($word =~ /^\s*$rni?($keywords)\s*$/oi ? 1 : 0);
}
##---------------------------------------------------------------------------
sub main'DTDis_attr_keyword {
    local($word) = shift;
    ($word =~ /^\s*($attr_keywords)\s*$/oi ? 1 : 0);
}
##---------------------------------------------------------------------------
sub main'DTDis_elem_keyword {
    local($word) = shift;
    ($word =~ /^\s*($elem_keywords)\s*$/oi ? 1 : 0);
}
##---------------------------------------------------------------------------
sub main'DTDget_parents {
    local($elem) = shift;

    $elem =~ tr/A-Z/a-z/;
    &compute_parents() unless defined(%Parents);
    return sort split(' ', $Parents{$elem});
}
##---------------------------------------------------------------------------
sub main'DTDreset {
    undef %ParEntity;
    undef %PubParEntity;
    undef %SysParEntity;
    undef %GenEntity;
    undef %StartTagEntity;
    undef %EndTagEntity;
    undef %MSEntity;
    undef %MDEntity;
    undef %PIEntity;
    undef %CDataEntity;
    undef %SDataEntity;
    undef %PubEntity;
    undef %SysEntity;
    undef %SysCDEntity;
    undef %SysNDEntity;
    undef %SysSDEntity;
    undef %SysSubDEntity;
    undef %Entity2Sys;
    undef %SysNotation;
    undef %PubNotation;
    undef %ShortRef;
    undef %UseMap;
    undef %ElemCont;
    undef %ElemInc;
    undef %ElemExc;
    undef %ElemTag;
    undef %Attribute;
    undef %Parents;
    undef %TopElement;
}
##---------------------------------------------------------------------------
                            ##---------------##
                            ## DTD Functions ##
                            ##---------------##
##---------------------------------------------------------------------------
##	compute_parents() generates the %Parents and %TopElement arrays.
##
sub compute_parents {
    local($elem, %exec);

    foreach $elem (&'DTDget_elements()) {
        foreach (&extract_elem_names($ElemExc{$elem})) { $exc{$_} = 1; }
        foreach (&extract_elem_names($ElemCont{$elem})) {
            $Parents{$_} .= ($Parents{$_} ? ' ' : '') . $elem
                unless $exc{$_} || &'DTDis_elem_keyword($_);
        }
        foreach (&extract_elem_names($ElemInc{$elem})) {
            $Parents{$_} .= ($Parents{$_} ? ' ' : '') . $elem
                unless $exc{$_} || &'DTDis_elem_keyword($_);
        }
        undef %exc;
    }
    foreach (keys %ElemCont) {
	$TopElement{$_} = 1 if !$Parents{$_} || $Parents{$_} eq $_;
    }
}
##---------------------------------------------------------------------------

##***************************************************************************##
##				PARSE ROUTINES				     ##
##***************************************************************************##
##---------------------------------------------------------------------------
##  Main routines defined:
##
##	sub main'DTDread_dtd	    -- Parse a SGML dtd
##	sub main'DTDread_mapfile    -- Parse entity map file
##
##  DTD routines defined:
##      sub del_comments            -- Delete in-line SGML comments
##	sub do_attlist		    -- Parse a ATTLIST statement
##	sub do_element		    -- Parse a ELEMENT statement
##	sub do_entity		    -- Parse a ENTITY statement
##	sub do_gen_entity	    -- Parse a general entity
##	sub do_shortref		    -- Parse a SHORTREF statement
##	sub do_usemap		    -- Parse a USEMAP statement
##	sub expand_parm_entities    -- Expand an parameter ent reference
##	sub extract_elem_names	    -- Retrieve element names from a string
##	sub find_ext_parm_ref	    -- Find external parm entities in string
##	sub get_exc		    -- Get exclusion element set
##	sub get_inc		    -- Get inclusion element set
##	sub get_next_group	    -- Get next group
##	sub get_next_string	    -- Get next quoted string
##	sub is_quote_char	    -- Check character if it is a quote
##	sub open_ext_entity	    -- Open an external entity file
##	sub quote_chars		    -- Escapes chars for pattern matches
##	sub read_comment	    -- Read comment from file handle
##	sub read_decleration	    -- Read decleration from file handle
##	sub read_doctype	    -- Read DOCTYPE from file handle
##	sub read_linktype	    -- Read LINKTYPE from file handle
##	sub read_msection	    -- Read marked section from file handle
##	sub read_subset		    -- Read subset from file handle
##	sub resolve_ext_entity_ref  -- External entity -> filename
##	sub subset_error	    -- Die if error in subset
##	sub update_top_elems	    -- Update %TopElement array
##	sub zip_wspace		    -- Delete extra whitespace
##
##  Notes:
##	The parsing routines have a specific calling sequence.  Many
##	of the routines rely on other routines updating the current
##	parsed line.  Many of them pass the current line by reference.
##
##	See individual routine declerations for more information.
##
##---------------------------------------------------------------------------

$IncMS	= 1;
$IgnMS	= 2;
                            ##----------------##
                            ## Main Functions ##
                            ##----------------##
##---------------------------------------------------------------------------
##	DTDread_dtd() parses the contents of an open file specified by
##	$handle.
##
sub main'DTDread_dtd {
    local($handle, $include) = @_;
    local($line);
    local($oldslash) = $/;
    local($old) = select($handle);

    return if $include == $IgnMS;
    $include = $IncMS unless $include;
    while (!eof($handle)) {
        $/ = $mdo;  $/ =~ s/\\//g;
        $line = <$handle>;              	# Read 'til first decleration
        &find_ext_parm_ref(*line, $include)	# Read any external files
	    if $include == $IncMS;
        last if eof($handle);           	# Exit if EOF
        &read_decleration($handle, $include)	# Read decleration
    }
    select($old);
    $/ = $oldslash;
}
##---------------------------------------------------------------------------
##	DTDread_mapfile() opens and parse the entity map file specified
##	by $filename.
##
sub main'DTDread_mapfile {
    local($filename) = @_;
    local($id, $file, $tmp);

    foreach (@P_SGML_PATH) {
	if (open(MAPFILE, "$_/$filename")) { $tmp = 1; last; }
    }
    warn "Unable to open entity map file: $filename\n", return
	unless $tmp;

    while (<MAPFILE>) {
	next if /^\s*$/ || /^#/;	# Skip blank/comment lines
	chop;  s/#.*$//;		# Delete end of line comments
	if (m%//($pubtl)\s+%) {
	    ($id, $tmp, $file) = $_ =~ m%^(.+)($pubtl)\s+(\S+)%;
	    $id .= $tmp;
	}
	else {
	    ($id, $file) = $_ =~ /^(\S+)\s+(\S+)/;
	}
	&zip_wspace($id);
	$Entity2Sys{$id} = $file  if ($id && $file);
    }
    close(MAPFILE);
}
##---------------------------------------------------------------------------
                            ##---------------##
                            ## DTD Functions ##
                            ##---------------##
##---------------------------------------------------------------------------
##	read_decleration() parses a decleration.  $include determines
##	if the decleration is to be included or ignored.
##
sub read_decleration {
    local($handle, $include) = @_;
    local($c, $line, $func, $tmp, $i, $q);

    $c = getc($handle);
    &read_comment($handle), return		# Comment decleration
	if $c =~ /^$comchar$/o;
    &read_msection($handle, $include), return	# Marked section
	if $c =~ /^$mso$/o;

    $func = $c;
    while ($c !~ /^\s*$/) {             # Get decleration type
        $c = getc($handle);
        $func .= $c;
    }
    chop $func;
    &read_doctype($handle, $include), return	# DOCTYPE decleration
	if $func =~ /$DOCTYPE/o;
    &read_linktype($handle, $include), return	# LINKTYPE decleration
	if $func =~ /$LINKTYPE/o;

    while ($c ne $mdc) {		# Get rest of decleration
        $c = getc($handle);
        if ($c =~ /^$comchar$/o) {
            $i = getc($handle);
            if ($i =~ /^$comchar$/o) {  	# Remove in-line comments
                $/ = $comc;  $/ =~ s/\\//g;
                $tmp = <$handle>;
            }
	    elsif (&is_quote_char($i)) {
		$/ = $i;  $tmp = <$handle>;
		$line .= $c . $i . $tmp;
	    }
            else { $line .= $c . $i }
        }
	elsif (&is_quote_char($c)) {		# Check for quoted string
	    $/ = $c;  $tmp = <$handle>;
	    $line .= $c . $tmp;
	}
        else { $line .= $c }
    }
    chop $line;
    $line =~ s/\n/ /g;
    eval "&$Function{$func}(\$line)"	# Interpret decleration
	if $include == $IncMS;
}
##---------------------------------------------------------------------------
##	read_comment() slurps up a comment decleration.
##
sub read_comment {
    local($handle) = @_;
    local($old) = select($handle);
    local($d) = $/;
    $/ = $comc . $mdc;  $/ =~ s/\\//g;
    <$handle>;
    $/ = $d;
    select($old);
}
##---------------------------------------------------------------------------
##	read_doctype() parses a DOCTYPE decleration.  $include determines
##	if the decleration is to be included or ignored.
##
sub read_doctype {
    local($handle, $include) = @_;
    local($line);

    $/ = $dso;  $/ =~ s/\\//g;
    $line = <$handle>;                  # Get text before $dso
    &read_subset($handle, $include, $dsc.$mdc);
}
##---------------------------------------------------------------------------
##	read_linktype() parses a LINKTYPE decleration.  $include determines
##	if the decleration is to be included or ignored.
##
sub read_linktype {
    local($handle, $include) = @_;
    local($line);

    $/ = $dso;  $/ =~ s/\\//g;
    $line = <$handle>;                  # Get text before $dso
    &expand_parm_entities(*line);
    print STDERR "$LINKTYPE decleration ignored\n";
    &read_subset($handle, $IgnMS, $dsc.$mdc);
}
##---------------------------------------------------------------------------
##	read_msection() parses marked section.  $include determines
##	if the section is to be included or ignored.
##
sub read_msection {
    local($handle, $include) = @_;
    local($line, $c);

    $/ = $dso;  $/ =~ s/\\//g;
    $line = <$handle>;                  # Get status keyword
    &expand_parm_entities(*line);
    $include = $IgnMS if $line !~ /^\s*$INCLUDE/o;
    &read_subset($handle, $include, $msc.$mdc);
}
##---------------------------------------------------------------------------
##	read_subset() parses any subset section.  $include determines
##	if the subset is included or ignored.  $endseq signifies the
##	end delimiting sequence of the subset.
##
sub read_subset {
    local($handle, $include, $endseq) = @_;
    local($c, $i, $line);
    $endseq =~ s/\\//g;
    local(@chars) = split(//, $endseq);
    foreach (@chars) { &quote_chars(*_); }

    while (1) {
        $c = getc($handle);  next if $c =~ /^\s*$/;
        if ($c =~ /^$mdo1char$/o) {     	# Decleration statement
            $c = getc($handle);
            &subset_error($c) unless $c =~ /^$mdo2char$/o;
            &read_decleration($handle, $include);
        }
        elsif ($c =~ /^$chars[0]$/) {		# End of subset section
	    for ($i=1; $i <= $#chars; ) {
		$c = getc($handle);
		if ($c =~ /^$chars[$i]$/) { $i++; }	# Part of $endseq
		elsif ($c =~ /[\s]/) { next; }		# Whitespace
		else { last; }
	    }
            return if $i > $#chars;
        }
        elsif ($c =~ /^$pero$/o) {		# Ext parm entity ref
            $line = $c;
            while (1) {
                $c = getc($handle);
                if ($c =~ /[$namechars]/o) { $line .= $c; }
                else { last; }
            }
            &find_ext_parm_ref(*line, $include) if $include == $IncMS;
        }
        else {
	    &subset_error($c);
        }
    }
}
##---------------------------------------------------------------------------
##	find_ext_parm_ref() evaulates in external parameter entity
##	references in *line.  $include is the INCLUDE/IGNORE flag
##	that is passed to DTDread_dtd.
##
sub find_ext_parm_ref {
    local(*line, $include) = @_;
    local($i, $tmp);
    while ($line =~ /$pero/) {
        $line =~ s/$pero([$namechars]+)$refc?//o;
        if (($i = &resolve_ext_entity_ref($1)) &&
            ($tmp = &open_ext_entity($i))) {
                &'DTDread_dtd($tmp, $include);
                close($tmp);
        }
    }
}
##---------------------------------------------------------------------------
##	subset_error() prints out a terse error message and dies.  This
##	routine is called if there is a syntax error in a subset section.
##
sub subset_error {
    local($c) = @_;
    die "Syntax error in subset.\n",
        "\tUnexpected character: $c\n";
}
##---------------------------------------------------------------------------
##	do_attlist() parses an ATTLIST definition.
##
sub do_attlist {
    local($line) = @_;
    local($tmp, $attname, $attvals, $attdef, %attr, @array);

    $tmp = &get_next_group(*line);	 # Get element name(s)
    if ($tmp =~ /^$grpo/) {
	$tmp =~ s/($grpo|$grpc|\s+)//go;
    }
    &expand_parm_entities(*tmp);
    $tmp =~ tr/A-Z/a-z/;		 # Convert all names to lowercase
    @names = split(/[\|&,\s]+/, $tmp);
    &expand_parm_entities(*line);
    while ($line !~ /^\s*$/) {
	$attname = &get_next_group(*line);
	$attvals = &get_next_group(*line);
	$attdef  = &get_next_group(*line);
	$attdef  =~ s/['"]//g;

	$attvals =~ s/[\(\)\s]//g;
	@array = split(/[$seq$and$or]/, $attvals);
	$attr{$attname} = join($;, $attdef, @array);
    }
    foreach (@names) {
	eval "%${_}_attr = %attr";
	$Attribute{$_} = "${_}_attr";
    }
}
##---------------------------------------------------------------------------
##	do_element() parses an ELEMENT definition.
##
sub do_element {
    local($line) = @_;
    local($tmp, @names, $tagm, $elcont, $elinc, $elexc);

    $tmp = &get_next_group(*line);	 # Get element name(s)
    if ($tmp =~ /^$grpo/) {
	$tmp =~ s/($grpo|$grpc|\s+)//go;
    }
    &expand_parm_entities(*tmp);
    $tmp =~ tr/A-Z/a-z/;		 # Convert all names to lowercase
    @names = split(/[\|&,\s]+/, $tmp);

    $line =~ s/^([-Oo]{1}\s+[-Oo]{1})\s+//; 	 # Get tag minimization
    $tagm = $1; $tagm = "- -" unless $tagm;

    $elcont = &get_next_group(*line);	 # Get content
    &expand_parm_entities(*elcont);

    if ($elcont ne $EMPTY) {		 # Get inclusion/exclusion groups
	$elcont =~ tr/A-Z/a-z/;
	while ($line !~ /^\s*$/) {
	    if ($line =~ /^$inc/) { $elinc = &get_inc(*line); }
	    elsif ($line =~ /^$exc/) { $elexc = &get_exc(*line); }
	    else { last; }
	}
	&expand_parm_entities(*elinc);
	&expand_parm_entities(*elexc);
	$elinc =~ tr/A-Z/a-z/;
	$elexc =~ tr/A-Z/a-z/;
    }

    foreach (@names) {			# Store element information
	if (defined($ElemCont{$_})) {
	    warn "Duplicate element decleration: $_\n"; }
	else {
	    ## OLD ## &update_top_elems($_, $elcont . $seq . $elinc); ##

	    $ElemCont{$_} = $elcont;
	    $ElemInc{$_} = $elinc;
	    $ElemExc{$_} = $elexc;
	    $ElemTag{$_} = $tagm;
	}
    }
}
##---------------------------------------------------------------------------
##	do_entity() parses an ENTITY definition.
##
sub do_entity {
    local($line) = @_;

    if ($line =~ /^\s*$pero/) { &do_parm_entity(*line); }
    else { &do_gen_entity(*line); }
}
##---------------------------------------------------------------------------
sub do_shortref {
    local($line) = @_;
    print STDERR "$SHORTREF decleration ignored\n";
}
##---------------------------------------------------------------------------
sub do_usemap {
    local($line) = @_;
    print STDERR "$USEMAP decleration ignored\n";
}
##---------------------------------------------------------------------------
##      del_comments() removes any inline comments from *line.
##      Unfortuneatly, this routines needs knowledge of the comment
##      delimiters.  If the deliminters are changed, this routine
##      must be updated.
##
sub del_comments {
    local(*line) = @_;
    $line =~ s/$como([^-]|[^-]-{1}[^-])*$comc//go;
}
##---------------------------------------------------------------------------
##	expand_parm_entities() expands all parameter entity references
##	in *line.
##
sub expand_parm_entities {
    local(*line) = @_;

    while ($line =~ s/$pero([$namechars]+)$refc?/$ParEntity{$1}/) {
	&del_comments(*line);
    };
}
##---------------------------------------------------------------------------
##	extract_elem_names() extracts just the element names of $str.
##	An array is returned.  The elements in $str are assumed to
##	separated by connectors.
##
sub extract_elem_names {
    local($str) = @_;
    $str =~ s/[$grpo$grpc$opt$plus$rep$inc\s]//go;
    return split(/[$seq$and$or]/, $str);
}
##---------------------------------------------------------------------------
##	open_ext_entity() opens the external entity file $filename.
##
sub open_ext_entity {
    local($filename) = @_;
    local($ret);

    foreach (@P_SGML_PATH) {
	if (open(EXTENT, "$_/$filename")) {
	    $ret = 'EXTENT';
	    last;
	}
    }
    warn "Unable to open $filename\n" unless $ret;
    $ret;
}
##---------------------------------------------------------------------------
##	resolve_ext_entity_ref() translates an external entity to
##	its corresponding filename.  The entity identifier is checked
##	first for resolution.  If that fails, then the entity name
##	itself is used for resolution.
##
sub resolve_ext_entity_ref {
    local($ent) = @_;
    local($aa, $tmp);

    EREFSW: {
	$aa = "PubParEntity", last EREFSW if defined($PubParEntity{$ent});
	$aa = "SysParEntity", last EREFSW if defined($SysParEntity{$ent});
	$aa = "PubEntity", last EREFSW if defined($PubEntity{$ent});
	$aa = "SysEntity", last EREFSW if defined($SysEntity{$ent});
	$aa = "SysCDEntity", last EREFSW if defined($SysCDEntity{$ent});
	$aa = "SysNDEntity", last EREFSW if defined($SysNDEntity{$ent});
	$aa = "SysSDEntity", last EREFSW if defined($SysSDEntity{$ent});
	$aa = "SysSubDEntity", last EREFSW if defined($SysSubDEntity{$ent});
	warn "Entity referenced, but not defined: $ent\n", return "";
    }
    $tmp = eval "\$Entity2Sys{\$$aa{\$ent}}"; return $tmp if $tmp;
    return $Entity2Sys{$ent} if $Entity2Sys{$ent};
    warn "Unable to resolve external entity ref: $ent\n";
    "";
}
##---------------------------------------------------------------------------
##	do_parm_entity() parses a parameter entity definition.
##
sub do_parm_entity {
    local(*line) = @_;
    local($name);

    $line =~ s/^\s*$pero?\s+//o;	  # Remove pero, '%'
    $line =~ s/^(\S+)\s+//; $name = $1;   # Get entity name
    if ($line =~ /^$PUBLIC/) {		  # PUBLIC external parm entity
	$line =~ s/^$PUBLIC\s+//o;
	$PubParEntity{$name} = &get_next_string(*line)
	    unless defined($PubParEntity{$name});
    }
    elsif ($line =~ /^$SYSTEM/) {	  # SYSTEM external parm entity
	$line =~ s/^$SYSTEM\s+//o;
	$SysParEntity{$name} = &get_next_string(*line)
	    unless defined($SysParEntity{$name});
    }
    else {				  # Regular parm entity
	$ParEntity{$name} = &get_next_string(*line)
	    unless defined($ParEntity{$name});
    }
}
##---------------------------------------------------------------------------
##	do_gen_entity() parses a general entity definition.
##
sub do_gen_entity {
}
##---------------------------------------------------------------------------
##	get_inc() gets the inclusion element group of an element
##	definition.
##
sub get_inc {
    local(*line) = @_;
    local($ret);
    $line =~ s/^\s*$inc?\s*//o;
    $ret = &get_next_group(*line);
    $ret;
}
##---------------------------------------------------------------------------
##	get_exc() gets the exclusion element group of an element
##	definition.
##
sub get_exc {
    local(*line) = @_;
    local($ret);
    $line =~ s/^\s*$exc?\s*//o;
    $ret = &get_next_group(*line);
    $ret;
}
##---------------------------------------------------------------------------
##	get_next_group gets the next group from a decleration.
##
sub get_next_group {
    local(*line) = @_;
    local($o, $c, $tmp, $ret);
    $line =~ s/^\s*//;
    if ($line =~ /^$grpo/o) {
	$o = 1;
	while ($o > $c) {
	    $line =~ s/^([^$grpc]*$grpc\S?)//o;
	    $ret .= $1;
	    $tmp = $ret;
	    $o = $tmp =~ s/$grpo//go;
	    $c = $tmp =~ s/$grpc//go;
	}
	$line =~ s/^\s*//;
    }
    else {
	$line =~ s/^(\S+)\s*//; $ret = $1; }
    &zip_wspace(*ret);
    $ret;
}
##---------------------------------------------------------------------------
##	get_next_string() gets the next string from *line.  This
##	function is used by the do*entity routines.
##
sub get_next_string {
    local(*line) = @_;
    local($ret, $q);

    $line =~ s/^\s*(['"])//; $q = $1;
    $line =~ s/^([^$q]*)\s*//; $ret = $1;
    &zip_wspace(*ret);
    $ret;
}
##---------------------------------------------------------------------------
##	update_top_elems() updates the %TopElement and
##	%NonTopElement arrays for keeping track of element hierarchy.
##
sub update_top_elems {
    local($elem, $contents) = @_;
    local(@array);

    @array = &extract_elem_names($contents);
    $TopElement{$elem} = 1  if !defined($NonTopElement{$elem});
    foreach (@array) {
	delete $TopElement{$_};
	$NonTopElement{$_} = 1;
    }
}
##---------------------------------------------------------------------------
##	is_quote_char() checks to see if $char is a quote character.
##	$quotes is defined in dtd_vars.pl.
##
sub is_quote_char {
    local($char) = @_;
    &quote_chars(*char);
    $quotes =~ /$char/;
}
##---------------------------------------------------------------------------
##      zip_wspace() takes a pointer to a string and strips all beginning
##      and ending whitespaces.  It also compresses all other whitespaces
##      into a single space character.
##
sub zip_wspace {
    local(*str) = @_;
    $str =~ s/^\s*(.*[^\s])\s*$/\1/;
    $str =~ s/\s{2,}/ /g;
}
##---------------------------------------------------------------------------
##      quote_chars() escapes special characters in case passed in string
##      will get be used in a pattern matching statement.  This prevents
##      the string from causing perl to barf because the string happens
##      to contain characters that have special meaning in pattern
##      matches.
##
sub quote_chars {
    local(*str) = @_;
    $str =~ s/([\[\]\(\)\.\^\{\}\$\*\?\+\\\|])/\\\1/g;
}
##---------------------------------------------------------------------------

##***************************************************************************##
##				TREE ROUTINES				     ##
##***************************************************************************##
##---------------------------------------------------------------------------##
##  Main routines defined:
##
##	sub main'DTDprint_tree		-- Output tree for an element
##
##  DTD routines defined:
##
##	sub print_elem			-- Print formated tree element
##	sub print_sub_tree		-- Does grunt work for main routines
##
##  Notes:
##      See individual routine declerations for more information.
##
##---------------------------------------------------------------------------##

$MAXLEVEL = 5;		# Default tree depth (root element has depth = 1)
$TREEFILE = 'STDOUT';	# Default output file

			    ##----------------##
			    ## Main Functions ##
			    ##----------------##
##---------------------------------------------------------------------------
##	DTDprint_tree() outputs the tree hierarchy of $elem to the
##	filehandle specified by $handle.  $depth specifies the maximum
##	depth of the tree.
##
sub main'DTDprint_tree {
    local($elem, $depth, $handle) = @_;
    local(%inc, %exc, %done, %open);
    $MAXLEVEL = $depth if ($depth > 0);
    $TREEFILE = $handle if $handle;
    &print_elem($elem, 1);
    $elem =~ tr/A-Z/a-z/;
    &print_sub_tree($elem, 2, *inc, *exc, *done);
}
##---------------------------------------------------------------------------
			    ##---------------##
			    ## DTD Functions ##
			    ##---------------##
##---------------------------------------------------------------------------
##	print_sub_tree() does the grunt work of outputing an element's
##	content hierarchy.  The routine cuts at elements that exist at
##	higher (or equal) levels or if $MAXLEVEL has been reached.  The
##	string "..." is appended to an element if has been cut-off due
##	to pre-existance at a higher (or equal) level.
##
##	Cutting the tree at repeat elements is necessary to avoid
##	a combinatorical explosion with recursive element definitions.
##	Plus, it does not make much since to repeat information.
##
##	Note: Higher, or equal level cousins are not recognized.
##	 Cut-offs are only determined from siblings, ancestors, and
##	 ancestors' siblings (ie. aunts & uncles).  Therefore, some
##	 sub-element content hierarchies may be repeated.
##	 
##	 In order to recognize cousins, a breadth first search is needed,
##	 or a full traversal of the hierarchy before outputing.  The
##	 above technique currently is sufficient to avoid combinatorical
##	 explosions.  Plus, it allows me to print out the tree while 
##	 traversing the element data.  There is no need to create a
##	 Perl tree data structure before printing (saves time, memory,
##	 and debugging).
##	 
##	 Unless deemed absolutely necessary, the above techique will not
##	 be changed.
##
sub print_sub_tree {
    local($elem, $level, *inc, *exc, *done, *open) = @_;
    local($tmp, $i, @array, @incarray, @excarray, %notdone, %lexc);

    return if $level > $MAXLEVEL;

    $done{$elem} = $level if ($level < $done{$elem} || !$done{$_});

    ## List inclusion elements due to ancestors ##
    @incarray = sort grep($inc{$_} > 0, sort keys %inc);
    if ($#incarray >= 0 ) {
	$tmp = '(Ia):';
	foreach (@incarray) { $tmp .= ' ' . $_; }
	&print_elem($tmp, $level, *open);
    }

    ## List exclusion elements due to ancestors ##
    @excarray = sort grep($exc{$_} > 0, sort keys %exc);
    if ($#excarray >= 0 ) {
	$tmp = '(Xa):';
	foreach (@excarray) { $tmp .= ' ' . $_; }
	&print_elem($tmp, $level, *open);
    }

    ## Get inclusion elements ##
    @incarray = sort &extract_elem_names($ElemInc{$elem});
    $tmp = '(I):' if $#incarray >= 0;
    foreach (@incarray) {
	$inc{$_}++;
	$tmp .= ' ' . $_;
    }
    &print_elem($tmp, $level, *open) if $#incarray >= 0;

    ## Get element contents ##
    @array = (@incarray, &extract_elem_names($ElemCont{$elem}));
    &remove_dups(*array);
    foreach (@array) {
	next if /^\s*$rni?($PCDATA|$RCDATA|$SDATA|$CDATA|$EMPTY)\s*$/oi;
	$done{$_} = $level+1, $notdone{$_} = 1
	    if ($level+1 < $done{$_} || !$done{$_});
    }

    ## Get exclusion elements ##
    @excarray = sort &extract_elem_names($ElemExc{$elem});
    $tmp = '(X):' if $#excarray >= 0;
    foreach (@excarray) {
	$exc{$_}++; $lexc{$_} = 1;
	$tmp .= ' ' . $_;
    }
    &print_elem($tmp, $level, *open) if $#excarray >= 0;

    &print_elem(' |', $level, *open);

    ## Output sub tree ##
    $i = 0;
    foreach (sort @array) {
	$open{$level} = ($i < $#array ? 1 : 0); $i++;
	s/^\s*($rni?)($PCDATA|$RCDATA|$SDATA|$CDATA|$EMPTY)\s*$/\1\U\2/oi;
	if (!$lexc{$_}) {
	    &print_elem($_ . ($done{$_} && !$notdone{$_} ? " ..." : ""),
			$level, *open);
	    &print_sub_tree($_, $level+1, *inc, *exc, *done, *open),
		$notdone{$_} = 0  if ($level < $MAXLEVEL &&
				      ($level+1 < $done{$_} || $notdone{$_}));
	}
    }
    &print_elem("", $level, *open);

    ## Remove include elements ##
    foreach (@incarray) { $inc{$_}--; }

    ## Remove exclude elements ##
    foreach (@excarray) { $exc{$_}--; }
}
##---------------------------------------------------------------------------
##	print_elem() is used by print_sub_tree() to output the elements
##	in a structured format to $TREEFILE.
##
##	Arguments:
##		$elem : String to print out
##		$level : Level of $elem
##		*open : Pointer to assoc array telling which levels are
##			still open.
##
sub print_elem {
    local($elem, $level, *open) = @_;
    local($i, $indent);
    if ($level == 1) {
	print($TREEFILE $elem, "\n"); }
    else {
	for ($i=2; $i < $level; $i++) {
	    $indent .= ($open{$i} ? " | " : "   "); }
	if ($elem ne "") {
	    if ($elem =~ /\(/) { $indent .= " | "; }
	    elsif ($elem !~ /\|/) { $indent .= " |_"; }
	}
	print($TREEFILE $indent, $elem, "\n");
    }
}
##---------------------------------------------------------------------------
sub remove_dups {
    local(*array) = shift;
    local(%dup);
    @array = grep($dup{$_}++ < 1, @array);
}
##---------------------------------------------------------------------------##

1;
