###########################################################################
#
# XMLPlug.pm -- base class for XML plugins
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package XMLPlug;

use BasPlug;
use doc;
use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

sub BEGIN {
    @XMLPlug::ISA = ('BasPlug');
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
}

use XMLParser;

my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BasPlug.process_exp}",
	'type' => "regexp",
	'deft' => &get_default_process_exp(),
	'reqd' => "no" },
      { 'name' => "xslt",
	'desc' => "{XMLPlug.xslt}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no" } ];

my $options = { 'name'     => "XMLPlug",
		'desc'     => "{XMLPlug.desc}",
		'abstract' => "yes",
		'inherits' => "yes",
		'args'     => $arguments };


our ($self);
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    
    # $self is global for use within subroutines called by XML::Parser
    $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
	# don't worry about any options etc
	return bless $self, $class;
    }

    my $parser = new XML::Parser('Style' => 'Stream',
				 'Handlers' => {'Char' => \&Char,
						'XMLDecl' => \&XMLDecl,
						'Entity' => \&Entity,
						'Doctype' => \&Doctype,
						'Default' => \&Default,
					    });  

    $self->{'parser'} = $parser;

    return bless $self, $class;
}

# the inheriting class must implement this method to tell whether to parse this doc type
sub get_doctype {
    my $self = shift(@_);
    die "$self The inheriting class must implement get_doctype method";
}


sub apply_xslt
{
    my $self = shift @_;
    my ($xslt,$filename) = @_;
    
    my $outhandle = $self->{'outhandle'};

    my $xslt_filename = $xslt;

    if (! -e $xslt_filename) {
	# Look in main site directory
	my $gsdlhome = $ENV{'GSDLHOME'};
	$xslt_filename = &util::filename_cat($gsdlhome,$xslt);
    }

    if (! -e $xslt_filename) {
	# Look in collection directory
	my $coldir = $ENV{'GSDLCOLLECTDIR'};
	$xslt_filename = &util::filename_cat($coldir,$xslt);
    }

    if (! -e $xslt_filename) {
	print $outhandle "Warning: Unable to find XSLT $xslt\n";
	if (open(XMLIN,"<$filename")) {

	    my $untransformed_xml = "";
	    while (defined (my $line = <XMLIN>)) {

		$untransformed_xml .= $line;
	    }
	    close(XMLIN);
	    
	    return $untransformed_xml;
	}
	else {
	    print $outhandle "Error: Unable to open file $filename\n";
	    print $outhandle "       $!\n";
	    return "";
	}
	
    }

    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
    my $xslt_base_cmd = "java -jar $jar_filename";
    my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";

    my $transformed_xml = "";

    if (open(XSLT_IN,"$xslt_cmd |")) {
	while (defined (my $line = <XSLT_IN>)) {

	    $transformed_xml .= $line;
	}
	close(XSLT_IN);
    }
    else {
	print $outhandle "Error: Unable to run command $xslt_cmd\n";
	print $outhandle "       $!\n";
    }

    return $transformed_xml;

}

sub check_doctype {
    $self = shift (@_);
    
    my ($filename) = @_;
    
    if (open(XMLIN,"<$filename")) {
	my $doctype = $self->get_doctype();
	## check whether the doctype has the same name as the root element tag
	while (defined (my $line = <XMLIN>)) {
	    ## find the root element
	    if ($line =~ /<([\w\d:]+)[\s>]/){
		my $root = $1;
		if ($root !~ $doctype){
		    close(XMLIN);
		    return 0;
		}
		else {
		    close(XMLIN); 
		    return 1;
		}
	    }
	}
	close(XMLIN);
    }
    
    return undef; # haven't found a valid line
    
}

# because we are not just using process_exp to determine whether to process or not, we need to implement this too, so that a file can be passed down if we are not actually processing it
sub metadata_read {
    $self = shift (@_);
    
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
 
    my $result = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli);

    if (defined $result) {
	# we think we are processing this, but check that we actually are
	my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

	if ($self->check_doctype($filename)) {
	    return $result;
	}
    }
    return undef;
}

sub read {
    # $self must be global to work with XML callback routines.
    $self = shift (@_);  
  
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # Make sure we're processing the correct file, do blocking etc
    my ($block_status,$filename) = $self->read_block(@_);    
    return $block_status if ((!defined $block_status) || ($block_status==0));

    ## check the doctype to see whether we really want to process the file
    if (!$self->check_doctype($filename)) {
	# this file is not for us
	return undef;
    }

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    $self->{'base_dir'} = $base_dir;
    $self->{'file'} = $file;
    $self->{'filename'} = $filename;
    $self->{'processor'} = $processor;
    $self->{'metadata'} = $metadata;

    eval {
	my $xslt = $self->{'xslt'};
	if (defined $xslt && ($xslt ne "")) {
	    # perform xslt
	    my $transformed_xml = $self->apply_xslt($xslt,$filename);

	    # feed transformed file (now in memory as string) into XML parser
	    $self->{'parser'}->parse($transformed_xml);
	}
	else {
	    $self->{'parser'}->parsefile($filename);
	}
    };
  
    if ($@) {

	# parsefile may either croak somewhere in XML::Parser (e.g. because
	# the document is not well formed) or die somewhere in XMLPlug or a
	# derived plugin (e.g. because we're attempting to process a
	# document whose DOCTYPE is not meant for this plugin). For the
	# first case we'll print a warning and continue, for the second
	# we'll just continue quietly

	print STDERR "**** Error is: $@\n";

	my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
	if (defined $msg) {	
	    my $outhandle = $self->{'outhandle'};
	    my $plugin_name = ref ($self);
	    print $outhandle "$plugin_name failed to process $file ($msg)\n";
	}

	# reset ourself for the next document
	$self->{'section_level'}=0;
	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	return -1; # error during processing
    }

    
    return 1; # processed the file
}

# the following two methods are for if you want to do the parsing from a
# plugin that inherits from this. it seems that you can't call the parse 
# methods directly. WHY???
sub parse_file {
    $self = shift (@_); 
    my ($filename) = @_;
    $self->{'parser'}->parsefile($filename);
}

sub parse_string {
    $self = shift (@_); 
    my ($xml_string) = @_;
    $self->{'parser'}->parse($xml_string);
}

sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.xml$^;
}

sub StartDocument {$self->xml_start_document(@_);}
sub XMLDecl {$self->xml_xmldecl(@_);}
sub Entity {$self->xml_entity(@_);}
sub Doctype {$self->xml_doctype(@_);}
sub StartTag {$self->xml_start_tag(@_);}
sub EndTag {$self->xml_end_tag(@_);}
sub Text {$self->xml_text(@_);}
sub PI {$self->xml_pi(@_);}
sub EndDocument {$self->xml_end_document(@_);}
sub Default {$self->xml_default(@_);}

# This Char function overrides the one in XML::Parser::Stream to overcome a
# problem where $expat->{Text} is treated as the return value, slowing
# things down significantly in some cases.
sub Char {
    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
    $_[0]->{'Text'} .= $_[1];
    return undef;
}

# Called at the beginning of the XML document.
sub xml_start_document {
    my $self = shift(@_);
    my ($expat) = @_;

    $self->open_document();
}

# Called for XML declarations
sub xml_xmldecl {
    my $self = shift(@_);
    my ($expat, $version, $encoding, $standalone) = @_;
}

# Called for XML entities
sub xml_entity {
  my $self = shift(@_);
  my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
}

# Called for DOCTYPE declarations - use die to bail out if this doctype
# is not meant for this plugin
sub xml_doctype {
    my $self = shift(@_);

    my ($expat, $name, $sysid, $pubid, $internal) = @_;
    die "XMLPlug Cannot process XML document with DOCTYPE of $name";
}


# Called for every start tag. The $_ variable will contain a copy of the
# tag and the %_ variable will contain the element's attributes.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
}

# Called for every end tag. The $_ variable will contain a copy of the tag.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
}

# Called just before start or end tags with accumulated non-markup text in
# the $_ variable.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;
}

# Called for processing instructions. The $_ variable will contain a copy
# of the pi.
sub xml_pi {
    my $self = shift(@_);
    my ($expat, $target, $data) = @_;
}

# Called at the end of the XML document.
sub xml_end_document {
    my $self = shift(@_);
    my ($expat) = @_;

    $self->close_document();
}

# Called for any characters not handled by the above functions.
sub xml_default {
    my $self = shift(@_);
    my ($expat, $text) = @_;
}

sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
    $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
}

sub close_document {
    my $self = shift(@_);
    my $doc_obj = $self->{'doc_obj'};
    # include any metadata passed in from previous plugins 
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, 
			   $doc_obj->get_top_section(), 
			   $self->{'metadata'});
   
    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);
   
    # add an OID
    $doc_obj->set_OID();
    
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");

    # process the document
    $self->{'processor'}->process($doc_obj);
    
    $self->{'num_processed'} ++;
}

1;




