###########################################################################
#
# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package OAIPlug;

use BasPlug;
use unicode;
use util;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use XMLPlug;

# Set up inheritance at compile time: OAIPlug is a specialisation of XMLPlug.
BEGIN {
    @OAIPlug::ISA = qw(XMLPlug);
}


# Command-line arguments this plugin adds on top of those it inherits.
# The default for process_exp is whatever get_default_process_exp() returns.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp(),
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "OAIPlug",
    'desc'     => "{OAIPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};


# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed straight through to the XMLPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option tables
#                      are appended to its "ArgList" / "OptList" entries
# Returns the object built by XMLPlug, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    my $plugin = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $plugin, $class;
}

# Returns the default regular expression (as a string) selecting the files
# this plugin handles: names ending in ".oai", matched case-insensitively.
# Also called as a plain function (no invocant) while the argument table
# is being built, so $self may be undefined here.
sub get_default_process_exp {
    my ($self) = @_;

    my $default_exp = '(?i)(\.oai)$';
    return $default_exp;
}

# Returns the document type name this plugin handles ("OAI-PMH").
sub get_doctype {
    my ($self) = @_;

    my $doctype = "OAI-PMH";
    return $doctype;
}

# Start-of-document callback: resets the per-file parsing state.
#   in_metadata_node - true while inside a <metadata> element
#   rawxml           - re-serialised copy of the document seen so far
sub xml_start_document {
    my ($self) = @_;

    $self->{'in_metadata_node'} = 0;
    $self->{'rawxml'}           = "";
}

# End-of-document callback: intentionally a no-op for this plugin.
sub xml_end_document {
    return;
}

# DOCTYPE callback.
#   $name - the document type name reported for the file being parsed
# Dies (with an empty message) unless the document type is OAI-PMH;
# otherwise announces the file on the plugin's output handle (verbosity > 1)
# and, when the 'gli' flag is set, on STDERR.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    # only OAI-PMH documents are accepted by this plugin
    unless ($name =~ /^OAI-PMH$/) {
        die "";
    }

    my $current_file = $self->{'file'};
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 1) {
        print $outhandle "OAIPlug: processing $current_file\n";
    }

    print STDERR "<Processing n='$current_file' p='OAIPlug'>\n" if $self->{'gli'};
}


# Start-tag callback: re-serialises the opening tag and appends it to the
# raw copy of the document and, while inside a <metadata> element, to the
# metadata buffer as well.
#   $expat   - parser object (unused here)
#   $element - name of the element being opened
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat,$element) = @_;

    # The attributes of the current element are read from the global hash %_
    # (XML::Parser's Stream style fills it in before calling start-tag
    # handlers -- presumably that is how the parent class drives this
    # method; confirm against XMLPlug).
    my %attr_hash = %_;

    # Emit attributes in sorted key order so the rebuilt tag is the same
    # from run to run (hash iteration order is not stable).
    # NOTE(review): values are written without quotes or escaping, so the
    # rebuilt tag is not well-formed XML when attributes are present; left
    # as-is because extract_oai_metadata parses this text with regexes.
    my $attr = join("", map { " $_=$attr_hash{$_}" } sort keys %attr_hash);

    my $open_tag = "<$element$attr>";

    $self->{'rawxml'} .= $open_tag;

    # a <metadata> element starts a fresh metadata buffer
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}

# End-tag callback: appends the closing tag to the raw copy of the document
# (and to the metadata buffer while inside <metadata>).  When the element
# being closed is <metadata> itself, the buffered metadata is handed to
# extract_oai_metadata together with $self->{'metadata'} and the
# in-metadata flag is cleared.
#   $expat   - parser object (unused here)
#   $element - name of the element being closed
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if ($self->{'in_metadata_node'});

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}

# Character-data callback: appends the current text, taken from $_, to the
# raw copy of the document and, while inside <metadata>, to the metadata
# buffer.
#   $expat - parser object (unused here)
sub xml_text {
    my ($self, $expat) = @_;

    # grab the text straight away, before anything else can change $_
    my $chunk = $_;

    $self->{'rawxml'} .= $chunk;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $chunk;
    }
}




# Reads one OAI record file.
#
# The file is parsed by the parent class's read(); the callbacks in this
# package build up $self->{'rawxml'} and the pretty-print table while that
# happens (presumably during SUPER::read -- confirm against XMLPlug).
# Afterwards one of two things happens:
#   * if one of the 'URL' metadata values is not an http:/ftp: URL and names
#     a file that exists relative to the record's directory, the collected
#     metadata is passed on to plugin::read for that file;
#   * otherwise a new document is built from the raw XML of the record.
#
# Parameters: $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
#             $total_count, $gli (all forwarded unchanged to SUPER::read).
# Returns: 0 for a directory whose name ends in "srcdocs", undef when the
#          parent read() returns false, -1 when process() fails, 1 when a
#          document was built, or whatever plugin::read returns when the
#          metadata is passed on to a source document.
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    # explicit undef (rather than a bare return) is kept so callers see the
    # same value as before in every context
    return undef unless ($self->SUPER::read(@_));

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    my $url_array = $metadata->{'URL'};
    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;

    my $srcdoc_exists = 0;
    my $srcdoc_pos = 0;
    my $filename_dir = &util::filename_head($filename);

    # look for a URL value that is really a local file sitting next to the
    # record; if several qualify, the last one wins
    for (my $i=0; $i<$num_urls; $i++) {

        next if ($url_array->[$i] =~ m/^(http|ftp):/);

        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);

        if (-e $src_filename) {
            $srcdoc_pos = $i;
            $srcdoc_exists = 1;
        }
    }

    if ($srcdoc_exists)
    {
        # use the entry that was actually found on disk, not blindly the
        # first URL (which may well be a remote http: address)
        my $srcdoc = $url_array->[$srcdoc_pos];

        print $outhandle "OAIPlug: passing metadata on to $srcdoc\n"
            if ($self->{'verbosity'}>1);

        # Make pretty print metadata table stick with src filename
        my $ppmd_table = $self->{'ppmd_table'};
        $metadata->{'prettymd'} = [ $ppmd_table ];
        $self->{'ppmd_table'} = undef;

        return &plugin::read ($pluginfo, $filename_dir, $srcdoc,
                              $metadata, $processor, $maxdocs, $total_count, $gli);
    }

    # no local source document: the record itself becomes the document
    my $doc_obj = doc->new ($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # attach the pretty-print table and clear it ready for the next file
    my $ppmd_table = $self->{'ppmd_table'};
    $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
    $self->{'ppmd_table'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}


# Plugin specific processing of doc_obj: adds the raw XML of the record,
# escaped for display, as the text of the document's top section.
#   $textref - reference to the raw XML text; it is escaped IN PLACE
#   $file    - file name, used only in progress messages
#   $doc_obj - document object receiving the text
#   $gli     - when true, a <Processing> line is also written to STDERR
#   ($pluginfo, $base_dir and $metadata are accepted but not used here)
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
    print $outhandle "OAIPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # Escape markup-significant characters so the record is shown verbatim.
    # '&' must be handled first, otherwise the entities produced for '<' and
    # '>' would themselves be re-escaped.  Without this, a literal '&' in
    # the record (e.g. "?a=1&copy=2" in a URL) could be read as the start of
    # an entity by whatever renders the text.
    # NOTE(review): assumes the text arrives already entity-decoded -- confirm.
    $$textref =~ s/&/&amp;/g;
    $$textref =~ s/</&lt;/g;
    $$textref =~ s/>/&gt;/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}


# TODO: merge the pretty-print helpers below with the newer version in MetadataPass

# Starts a fresh pretty-print metadata table: $self->{'ppmd_table'} is
# overwritten with the opening <table> tag.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attrs = (
        "width=100% cellspacing=2",
        "style=\'border-bottom: 4px solid #000080\'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attrs) . ">";
}

# Appends one two-column row (name, value) to the pretty-print table held
# in $self->{'ppmd_table'}.
#   $metaname       - text for the first column
#   $metavalue_utf8 - text for the second column; the first occurrence of
#                     "hdl.handle.net" is rewritten to a fixed host and the
#                     result is passed through util::hyperlink_text
sub add_prettyprint_metadata_line 
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);

    my @row = (
        "  <tr bgcolor=#b5d3cd>\n",
        "    <td width=30%>\n",
        "      $metaname\n",
        "    </td>\n",
        "    <td>\n",
        "      $metavalue_utf8\n",
        "    </td>\n",
        "  </tr>\n",
    );

    $self->{'ppmd_table'} .= join("", @row);
}

# Finishes the pretty-print metadata table by appending the closing
# </table> tag to $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    my $closing_tag = "</table>\n";
    $self->{'ppmd_table'} .= $closing_tag;
}




# Extracts metadata from the text of a <metadata> element.
#   $textref  - reference to the serialised <metadata>...</metadata> text
#   $metadata - hash ref of metadata name => array ref of values; every
#               value found is appended to the array for its name
# Only flat Dublin Core content is handled.  An element name has any "dc:"
# prefix removed and its first letter upper-cased, and "Identifier" is
# stored under "URL".  Each name/value pair is also added to the
# pretty-print table, which is opened and closed here even when nothing
# is found.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    # Only handles DC metadata

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # strip the <oai_dc:dc ...> / <dc ...> wrapper if there is one
        $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;

        # Peel off one <name attrs>value</name> element per iteration.
        # Only the tag name itself is captured: attributes (for example
        # xml:lang) must not become part of the metadata name, otherwise
        # the name is garbage and "Identifier" is never recognised.
        while (my ($tagname, $metavalue, $remainder)
               = $metadata_text =~ m/<([^\s>]+)[^>]*>(.*?)<\/.*?>(.*)/s)
        {
            $metadata_text = $remainder;

            my $metaname = $tagname;
            $metaname =~ s/^(dc:)?(.)/\u$2/;

            if ($metaname eq "Identifier")
            {
                # name clashes with GSDL reserved metadata name for hash id
                $metaname = "URL";
            }

            if (defined $metadata->{$metaname})
            {
                push(@{$metadata->{$metaname}},$metavalue);
            }
            else
            {
                $metadata->{$metaname} = [ $metavalue ];
            }

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}

1;
