###########################################################################
#
# MARCXMLPlug.pm
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# Processes MARCXML documents. Note that this plugin does no
# syntax checking (though the XML::Parser module tests for
# well-formedness).

package MARCXMLPlug;

use XMLPlug;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

# Establish the inheritance chain at compile time: MARCXMLPlug is an XMLPlug.
BEGIN {
    @MARCXMLPlug::ISA = qw(XMLPlug);
}

# Arguments specific to this plugin. new() appends them to the shared
# "ArgList" it is handed.
my $arguments = [
    {
        'name' => "metadata_mapping_file",
        'desc' => "{MARCXMLPlug.metadata_mapping_file}",
        'type' => "string",
        'reqd' => "no",
    },
];

# Option record describing this plugin. new() appends it to the shared
# "OptList" it is handed.
my $options = {
    'name'     => "MARCXMLPlug",
    'desc'     => "{MARCXMLPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the XMLPlug constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the XMLPlug-constructed object, seeded with this plugin's
# per-parse state and re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments} if defined $arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options if defined $options;

    my $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # Starting values for the state the XML callbacks read and update.
    my %initial_state = (
        'content'          => "",
        'record_count'     => 1,
        'language'         => "",
        'encoding'         => "",
        'marc_mapping'     => {},
        'current_code'     => "",
        'current_tag'      => "",
        'current_element'  => "",
        'metadata_mapping' => undef,    # filled in by init()
        'num_processed'    => 0,
        'indent'           => 0,
    );
    foreach my $state_key (keys %initial_state) {
        $self->{$state_key} = $initial_state{$state_key};
    }

    return bless $self, $class;
}

# Returns the name of the document (root) element this plugin handles:
# always the string "collection".
sub get_doctype {
    my ($self) = @_;

    my $doctype = "collection";
    return $doctype;
}


# Loads the MARC -> Greenstone metadata mapping table (once per object) and
# then delegates to the parent class's init().
#
# Parameters ($verbosity, $outhandle, $failhandle) are forwarded unchanged to
# SUPER::init; $outhandle and $failhandle also receive diagnostics from here.
#
# The mapping file is taken from $self->{'metadata_mapping_file'}; when that
# is unset or empty it defaults to $GSDLHOME/etc/marctodc.txt, and the chosen
# path is written back to $self->{'metadata_mapping_file'}.
#
# Each mapping line has the form
#     <digits, optionally followed by one word character> -> <name>
# where <name> consists of word characters and '^'. Lines starting with '#'
# and blank lines are skipped; anything else is reported on $outhandle as a
# parse error and ignored.
#
# If the file is missing or unreadable the problem is reported and the
# mapping is left empty; this sub never dies because of the mapping file.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    ## the mapping file has already been loaded
    if (defined $self->{'metadata_mapping'}) {
        $self->SUPER::init(@_);
        return;
    }

    my $metadata_mapping = {};

    # work out which mapping file to read
    my $mm_file = $self->{'metadata_mapping_file'};

    if (!defined $mm_file or $mm_file eq "") {
        $mm_file = util::filename_cat($ENV{'GSDLHOME'}, "etc", "marctodc.txt");
        $self->{'metadata_mapping_file'} = $mm_file;
    }

    if (!-e $mm_file) {
        # Name the file we actually looked for in the message.
        my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \"" .
            $mm_file . "\".\n    This file should be at $mm_file\n" .
            "    No marc files can be processed.\n";

        print $outhandle $msg;
        print $failhandle $msg;
        # Do not exit here: the mapping simply stays empty (see the
        # assignment after this if/elsif/else chain).
    }
    elsif (open(my $mm_in, '<', $mm_file)) {
        # Three-argument open on a lexical handle: the path is never
        # interpreted as a mode or a pipe, and the handle cannot clash with
        # another MMIN elsewhere in the program.
        my $line_num = 1;
        while (defined(my $line = <$mm_in>)) {
            chomp $line;
            # Tolerate CRLF line endings and stray trailing blanks.
            $line =~ s/\s+$//;

            if ($line =~ m/^(\d+\w?)\s*->\s*([\w\^]+)$/) {
                my $marc_info = $1;
                my $gsdl_info = $2;
                $metadata_mapping->{$marc_info} = $gsdl_info;
            }
            elsif ($line !~ m/^\#/       # allow comments (# in first column)
                   && $line !~ m/^\s*$/) # allow blank lines
            {
                print $outhandle "Parse error on line $line_num of $mm_file:\n";
                print $outhandle "  \"$line\"\n";
            }
            $line_num++;
        }
        close($mm_in);
    }
    else {
        print STDERR "Unable to open $mm_file: $!\n";
    }

    # Always a hash reference from here on, possibly empty.
    $self->{'metadata_mapping'} = $metadata_mapping;

    $self->SUPER::init(@_);
}

# Called for DOCTYPE declarations - use die to bail out if this doctype
# is not meant for this plugin
# DOCTYPE callback. This plugin accepts every doctype, so the arguments
# ($expat, $name, $sysid, $pubid, $internal) are ignored and nothing is
# returned.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    return;
}


# Start-of-document callback: resets the per-file state.
#
# Asks textcat_get_language_encoding() for the language and encoding of
# $self->{'filename'}, stores them on the object, resets 'element_count' to 1
# and 'indent' to 0, then announces the file on the output handle (when
# verbosity > 1) and on STDERR (when the 'gli' flag is set).
sub xml_start_document {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my $source_path = $self->{'filename'};
    my ($detected_language, $detected_encoding) =
        $self->textcat_get_language_encoding($source_path);

    $self->{'language'}      = $detected_language;
    $self->{'encoding'}      = $detected_encoding;
    $self->{'element_count'} = 1;
    $self->{'indent'}        = 0;

    my $log = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print $log "MARCXMLPlug: processing $self->{'file'}\n";
    }
    $self->{'gli'} and print STDERR "<Processing n='$self->{'file'}' p='MARCXMLPlug'>\n";
}

# End-of-document callback: deliberately does nothing and returns nothing.
sub xml_end_document {
    return;
}

# Start-tag callback.
#
# Parameters: ($expat, $element) - the parser object (unused here) and the
# element name. The raw markup of the tag is read from $_ (set up by the
# caller - presumably XML::Parser's Stream style; confirm in XMLPlug).
#
# Effects:
#   * records $element in $self->{'current_element'};
#   * on <record>: creates a new doc object, adds the Language, Encoding,
#     Source, SourceSegment, Plugin and FileFormat metadata, assigns its OID,
#     stores it in $self->{'doc_obj'} and bumps the record counters;
#   * on <datafield>: remembers a non-empty "tag" attribute in 'current_tag';
#   * on <subfield>: remembers a non-empty "code" attribute in 'current_code'
#     (only while a tag is current);
#   * for every element except <collection>: appends the escaped tag, on a
#     new "<br/>" line with indentation, to $self->{'content'}.
sub xml_start_tag {
    my $self = shift;
    my $expat = shift;
    my $element = shift;

    # Take one private copy of the tag markup so nothing called below can
    # clobber it through the global $_.
    my $tag_markup = $_;
    my $text = $self->escape_text($tag_markup);

    $self->{'current_element'} = $element;

    ## get all attributes of this element and store them in a map name=>value
    ## Both "double" and 'single' quoted values are legal XML, and a value may
    ## be empty or contain non-word characters (e.g. ind1=" ").
    my %attr_map = ();
    while ($tag_markup =~ /(\w+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g) {
        $attr_map{$1} = defined $2 ? $2 : $3;
    }

    ## create a new document for each record
    if ($element eq "record") {
        my $processor = $self->{'processor'};
        my $filename = $self->{'filename'};
        my $file = $self->{'file'};

        my $doc_obj = 'doc'->new($filename);
        my $top_section = $doc_obj->get_top_section();

        $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($top_section, "Language", $self->{'language'});
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $self->{'encoding'});

        # Source is the last path component of the file name.
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $doc_obj->add_utf8_metadata($top_section, "Source", ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$self->{'record_count'}");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        $doc_obj->add_metadata($top_section, "FileFormat", "XML");

        # Let the doc object generate its OID, then make it unique per record
        # by appending the record number (see set_OID in this package).
        $doc_obj->set_OID();
        $self->set_OID($doc_obj, $doc_obj->get_OID(), $self->{'record_count'});

        my $outhandle = $self->{'outhandle'};
        print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;

        $self->{'record_count'}++;
        $self->{'doc_obj'} = $doc_obj;
        $self->{'num_processed'}++;
    }

    ## get the marc code, for example 520
    if ($element eq "datafield") {
        if (defined $attr_map{'tag'} and $attr_map{'tag'} ne "") {
            $self->{'current_tag'} = $attr_map{'tag'};
        }
    }

    ## remember the subfield code (for example a or b) of the current marc code
    if ($element eq "subfield") {
        if (defined $attr_map{'code'} and $attr_map{'code'} ne "" and $self->{'current_tag'} ne "") {
            $self->{'current_code'} = $attr_map{'code'};
        }
    }

    ## indentation level used when echoing the tag: record 0, subfield 2,
    ## everything else 1
    if ($element eq "record") {
        $self->{'indent'} = 0;
    }
    elsif ($element eq "subfield") {
        $self->{'indent'} = 2;
    }
    else {
        $self->{'indent'} = 1;
    }

    if ($element ne "collection") {
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $text;
    }
}



# End-tag callback.
#
# Parameters: ($expat, $element) - the parser object (unused here) and the
# element name. The raw markup of the end tag is read from $_.
#
# </record> (with a document open): appends the tag to the accumulated
#   content, stores the content as the document's text, hands the document to
#   the processor, then clears 'content' and 'doc_obj' and returns.
#
# </datafield> (with a document open and subfield data captured): turns the
#   captured data into metadata using $self->{'metadata_mapping'}. The
#   captured data, $self->{'marc_mapping'}, maps a tag to a flat list
#   (code, text, code, text, ...). A mapping key is either
#     - a bare tag such as "520": every subfield text of that tag is
#       concatenated, each followed by a single space; or
#     - three digits plus a subfield code such as "520a": only that
#       subfield's text is used (the last one if the code is repeated).
#   '[' and ']' in the value are backslash-escaped. Mapping entries that
#   produce no value are skipped. Afterwards 'marc_mapping' and
#   'current_tag' are reset.
#
# Finally the escaped end tag is appended to 'content' (on a new indented
# line for </datafield>, inline otherwise).
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    my $text = $self->escape_text($_);

    if ($element eq "record" and defined $self->{'doc_obj'}) {
        # process the document
        my $processor = $self->{'processor'};
        my $doc_obj = $self->{'doc_obj'};
        $self->{'content'} .= "<br/>" . $text;

        $doc_obj->add_utf8_text($doc_obj->get_top_section(), $self->{'content'});
        $processor->process($doc_obj);

        ## clean up
        $self->{'content'} = "";
        $self->{'doc_obj'} = undef;
        return;
    }

    ## map the xmlmarc to gsdl metadata
    if ($element eq "datafield" and defined $self->{'doc_obj'} and defined $self->{'marc_mapping'}) {
        # Fall back to an empty table if no mapping has been loaded.
        my $metadata_mapping = $self->{'metadata_mapping'} || {};
        my $marc_mapping = $self->{'marc_mapping'};
        my $doc_obj = $self->{'doc_obj'};

        # Sorted so that metadata is added in a reproducible order.
        foreach my $marc_field (sort keys %$metadata_mapping) {
            # Split a key like "520a" into tag "520" and subfield "a"; the
            # captured data is keyed by tag alone.
            my ($tag, $subfield) = ($marc_field, undef);
            if ($marc_field =~ /^(\d\d\d)(\w)$/) {
                ($tag, $subfield) = ($1, $2);
            }

            my $matched_field = $marc_mapping->{$tag};
            next unless defined $matched_field;

            my $meta_name = $metadata_mapping->{$marc_field};
            my $meta_value;

            if (defined $subfield) {
                # The flat (code, text, ...) list converts directly to a hash.
                my %mapped_subfield = (@$matched_field);
                $meta_value = $mapped_subfield{$subfield};
            }
            else {
                ## get all values: the texts sit at the odd positions
                for (my $i = 1; $i < scalar(@$matched_field); $i += 2) {
                    $meta_value .= $matched_field->[$i] . " ";
                }
            }

            # Nothing captured for this mapping entry in this datafield.
            next unless defined $meta_value;

            ## escape [ and ]
            $meta_value =~ s/([\[\]])/\\$1/g;
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $meta_name, $meta_value);
        }

        ## clean up
        $self->{'marc_mapping'} = undef;
        $self->{'current_tag'} = "";
    }

    if ($element eq "datafield") {
        $self->{'indent'} = 1;
        $self->{'content'} .= "<br/>" . $self->calculate_indent($self->{'indent'}) . $text;
    }
    else {
        $self->{'content'} .= $text;
    }
}


# Gives $doc_obj the identifier "<id>r<record_number>", so that each record
# taken from the same file gets a distinct OID.
#
# Parameters: $doc_obj (object providing set_OID), $id (base identifier),
# $record_number (record counter to append).
sub set_OID {
    my ($self, $doc_obj, $id, $record_number) = @_;

    my $record_oid = join("r", $id, $record_number);
    $doc_obj->set_OID($record_oid);
}

# Text callback. The text itself is read from $_.
#
# When the current element is a subfield with a pending code and the text is
# non-empty, the pair (code, text) is appended to the list kept under the
# current tag in $self->{'marc_mapping'} - for example 520 => (a, "A poem
# about ...") - and the pending code is cleared. In every case the escaped
# text is appended to $self->{'content'}.
sub xml_text {
    my ($self, $expat) = @_;

    my $chunk = $_;
    my $inside_subfield = ($self->{'current_element'} eq "subfield");

    if ($inside_subfield && $self->{'current_code'} ne "" && $chunk ne "") {
        ## remember code and text, in that order, under the current tag
        my $tag = $self->{'current_tag'};
        push @{ $self->{'marc_mapping'}->{$tag} }, $self->{'current_code'}, $chunk;
        $self->{'current_code'} = "";
    }

    $self->{'content'} .= $self->escape_text($chunk);
}

# Builds the HTML indentation for nesting level $num: four non-breaking
# space entities per level. Returns the empty string when $num is not
# greater than zero.
sub calculate_indent {
    my ($self, $num) = @_;

    my $one_level = "&nbsp;&nbsp;&nbsp;&nbsp;";
    my $padding = "";
    my $levels_done = 0;

    while ($levels_done < $num) {
        $padding .= $one_level;
        $levels_done++;
    }

    return $padding;
}

# Returns a copy of $text with the characters & < > " replaced by the
# entities &amp; &lt; &gt; &quot;. Every character is replaced exactly once
# in a single pass, so the ampersands introduced by the entities are never
# escaped again.
sub escape_text {
    my ($self, $text) = @_;

    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    $text =~ s/([&<>"])/$entity_for{$1}/g;

    return $text;
}


1;


