###########################################################################
#
# MetadataXMLPlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2006 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# MetadataXMLPlug processes metadata.xml files in a collection

# Here's an example of a metadata file that uses three FileSet structures
# (ignore the # characters):

#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
#<DirectoryMetadata>
#  <FileSet>
#    <FileName>nugget.*</FileName>
#    <Description>
#      <Metadata name="Title">Nugget Point, The Catlins</Metadata>
#      <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
#    </Description>
#  </FileSet>
#  <FileSet>
#    <FileName>nugget-point-1.jpg</FileName>
#    <Description>
#      <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
#      <Metadata name="Subject">Lighthouse</Metadata>
#    </Description>
#  </FileSet>
#  <FileSet>
#    <FileName>kaka-point-dir</FileName>
#    <Description>
#      <Metadata name="Title">Kaka Point, The Catlins</Metadata>
#    </Description>
#  </FileSet>
#</DirectoryMetadata>

# Metadata elements are read and applied to files in the order they appear
# in the file.
#
# The FileName element describes the subfiles in the directory that the
# metadata applies to as a perl regular expression (a FileSet group may
# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
# indicates that the metadata records in the following Description block
# apply to every subfile that starts with "nugget".  For these files, a
# Title metadata element is set, overriding any old value that the Title
# might have had.
#
# Occasionally, we want to have multiple metadata values applied to a
# document; in this case we use the "mode=accumulate" attribute of the
# particular Metadata element.  In the second metadata element of the first
# FileSet above, the "Place" metadata is accumulating, and may therefore be
# given several values.  If we wanted to override these values and use a
# single metadata element again, we could set the mode attribute to
# "override" instead.  Remember: every element is assumed to be in override
# mode unless you specify otherwise, so if you want to accumulate metadata
# for some field, every occurrence must have "mode=accumulate" specified.
#
# The second FileSet element above applies to a specific file, called
# nugget-point-1.jpg.  This element overrides the Title metadata set in the
# first FileSet, and adds a "Subject" metadata field.
#
# The third and final FileSet sets metadata for a subdirectory rather than
# a file.  The metadata specified (a Title) will be passed into the
# subdirectory and applied to every file that occurs in the subdirectory
# (and to every subsubdirectory and its contents, and so on) unless the
# metadata is explicitly overridden later in the import.

package MetadataXMLPlug;

use strict;
no strict 'refs';
use BasPlug;
use util;
use metadatautil;

# Compile-time setup: make the Greenstone-bundled CPAN directory the first
# entry of @INC and declare BasPlug as this package's parent class.
BEGIN {
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib/cpan");
    @MetadataXMLPlug::ISA = ('BasPlug');
}

use XMLParser;

# Argument descriptions contributed by this plugin; new() appends them to
# the "ArgList" entry of the option lists it is given.
my $arguments = [
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_block_exp(),
    },
];

# Plugin description record; new() appends it to the "OptList" entry.
my $options = {
    'name'     => "MetadataXMLPlug",
    'desc'     => "{MetadataXMLPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# File-scoped handle on the plugin object.  new() assigns it, and the
# XML::Parser callbacks (StartTag, EndTag, Text) read their state through it.
my $self;

# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the BasPlug-constructed object re-blessed into $class.  The object
# is also stored in the file-scoped $self that the XML handlers use.
sub new {
    my ($class, @ctor_args) = @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @ctor_args;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # In info_only mode the parser and its state are not needed, so the
    # initialisation below is skipped.
    unless ($self->{'info_only'}) {
        # Stream-style parser for metadata.xml files; Char and Doctype are
        # registered here as explicit handlers.
        my %handlers = ('Char' => \&Char, 'Doctype' => \&Doctype);
        $self->{'parser'} = XML::Parser->new('Style'    => 'Stream',
                                             'Handlers' => \%handlers);
        $self->{'in_filename'} = 0;
    }

    return bless $self, $class;
}


# Returns the source text of the regular expression for file names ending
# in "metadata.xml".
# NOTE(review): presumably consumed by BasPlug as the default process
# expression - confirm against BasPlug.
sub get_default_process_exp
{
    my $process_exp = 'metadata\.xml$';
    return $process_exp;
}

# Default value for the "block_exp" argument: the source text of a regular
# expression matching file names that end in "metadata.xml", so that other
# plugins do not get to see these files.  (Block expressions are currently
# only consulted during the read stage.)
sub get_default_block_exp
{
    my $block_exp = 'metadata\.xml$';
    return $block_exp;
}

# Parse one metadata.xml file and record what it describes.
#
# Arguments (after the invocant):
#   $pluginfo, $metadata, $processor, $maxdocs - not used by this method
#   $base_dir, $file - joined with util::filename_cat to locate the file
#   $extrametakeys   - array ref that receives the FileName targets
#   $extrametadata   - hash ref that receives target => metadata structure
#   $gli             - when true, a "<Processing ...>" line goes to STDERR
#
# Returns undef when the file is not an existing metadata.xml file, -1 when
# parsing fails (the error is printed to the plugin's outhandle) and 1 on
# success.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata,
        $processor, $maxdocs, $gli) = @_;

    my $filename = &util::filename_cat($base_dir, $file);
    if ($filename !~ /metadata\.xml$/ || !-f $filename) {
        return undef;
    }

    print STDERR "\n<Processing n='$file' p='MetadataXMLPlug'>\n" if ($gli);
    print STDERR "MetadataXMLPlug: processing $file\n" if ($self->{'verbosity'} > 1);

    $self->{'metadataref'} = $extrametadata;
    $self->{'metakeysref'} = $extrametakeys;

    # Start every file from a clean handler state.  A previous file that
    # failed part-way through an element would otherwise leave these set,
    # e.g. a stale 'in_filename' makes the next file's whitespace get
    # recorded as a target.
    $self->{'in_filename'} = 0;
    $self->{'metadata_name'} = "";
    $self->{'metadata_accumulate'} = 0;
    $self->{'saved_targets'} = [];
    $self->{'saved_metadata'} = {};

    # The parser dies on malformed XML and when the Doctype handler rejects
    # the document, so trap that and report it instead of aborting.
    eval {
        $self->{'parser'}->parsefile($filename);
    };

    if ($@) {
        my $outhandle = $self->{'outhandle'};
        my $plugin_name = ref ($self);
        print $outhandle "$plugin_name failed to process $file ($@)\n";

        return -1; #error
    }
    return 1;

}

# XML::Parser Doctype handler: abort the parse unless the document type is
# one this plugin understands.
#
# Arguments: the expat object and the DOCTYPE name; any further handler
# arguments are ignored.
#
# Dies with a message naming the rejected document type, so that the
# failure reported by metadata_read says what was wrong.
sub Doctype {
    my ($expat, $name) = @_;

    # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
    # to be processed as well as the "DirectoryMetadata" files which should now
    # be created by import.pl
    if (!defined $name || $name !~ /^(Greenstone)?DirectoryMetadata$/) {
        my $shown = defined $name ? $name : "";
        die "unexpected document type '$shown' (expected DirectoryMetadata)";
    }
}

# Stream-style start-tag callback.  The element's attributes are read from
# the global hash %_.  State is kept on the file-scoped $self that new()
# assigned.
#
#   FileSet  - begin a fresh list of targets and a fresh metadata set
#   FileName - flag that the following text is a target name
#   Metadata - remember the metadata name and whether mode="accumulate"
sub StartTag {
    my ($expat, $element) = @_;

    if ($element eq "FileSet") {
        $self->{'saved_targets'} = [];
        $self->{'saved_metadata'} = {};
    }
    elsif ($element eq "FileName") {
        $self->{'in_filename'} = 1;
    }
    elsif ($element eq "Metadata") {
        my $mode = $_{'mode'};
        $self->{'metadata_name'} = $_{'name'};
        # Any mode other than "accumulate" (or none at all) means override.
        $self->{'metadata_accumulate'} =
            (defined $mode && $mode eq "accumulate") ? 1 : 0;
    }
}

# Stream-style end-tag callback.  State is kept on the file-scoped $self
# that new() assigned.
#
#   FileSet  - publish the metadata collected for this FileSet: every target
#              is appended to the metakeys list, and the metadata is stored
#              for (or merged into) each target's entry
#   FileName - stop treating text as a target name
#   Metadata - stop treating text as a metadata value
sub EndTag {
    my ($expat, $element) = @_;

    if ($element eq "FileSet") {
        my $saved_metadata = $self->{'saved_metadata'};

        # NOTE(review): targets are appended even if an earlier FileSet
        # already listed them - confirm consumers tolerate duplicate keys.
        push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}});

        foreach my $target (@{$self->{'saved_targets'}}) {
            my $file_metadata = $self->{'metadataref'}->{$target};
            if (!defined $file_metadata) {
                # Give each target its own copy (including the accumulate
                # arrays).  Sharing one structure between the targets of a
                # FileSet would make a later merge into one target's entry
                # show up on all of its siblings.
                my %own_metadata;
                foreach my $mname (keys %$saved_metadata) {
                    my $mvalue = $saved_metadata->{$mname};
                    $own_metadata{$mname} =
                        (ref ($mvalue) eq "ARRAY") ? [@$mvalue] : $mvalue;
                }
                $self->{'metadataref'}->{$target} = \%own_metadata;
            }
            else {
                # The helper's result is not used here, so the merge is
                # expected to update $file_metadata in place.
                $self->combine_metadata_structures($file_metadata,$saved_metadata);
            }
        }
    }
    elsif ($element eq "FileName") {
        $self->{'in_filename'} = 0;
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = "";
    }

}

# Stream-style text callback; the text itself is read from $_.
#
# Inside a FileName element the text is recorded as a target.  Inside a
# Metadata element (a non-empty metadata name is pending) it is stored as
# that field's value.  Text anywhere else is ignored.
sub Text {
    my $content = $_;
    my $mname = $self->{'metadata_name'};

    if ($self->{'in_filename'}) {
        push (@{$self->{'saved_targets'}}, $content);
    }
    elsif (defined ($mname) && $mname ne "") {
        $self->store_saved_metadata($mname, $content,
                                    $self->{'metadata_accumulate'});
    }
}

# Character-data handler used in place of the one that comes with
# XML::Parser's Stream style.  That one ends up treating $expat->{Text} as
# its return value, which slows things down significantly in some cases, so
# this version appends the new chunk and explicitly returns undef.
sub Char {
    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
    my $expat = $_[0];
    $expat->{'Text'} .= $_[1];
    return undef;
}

# Method wrapper that hands both metadata structures to
# metadatautil::combine_metadata_structures and returns whatever it returns.
# The invocant itself is not used.
sub combine_metadata_structures
{
    my ($self, $mdref1, $mdref2) = @_;

    return metadatautil::combine_metadata_structures($mdref1, $mdref2);
}

# Record one metadata value in the current FileSet's 'saved_metadata' hash.
#
#   $mname         - metadata field name
#   $mvalue        - value to record
#   $md_accumulate - true: add to the field's values (kept as an array ref);
#                    false: replace whatever the field held with the scalar
sub store_saved_metadata
{
    my ($self, $mname, $mvalue, $md_accumulate) = @_;

    # Work through a reference to the field's slot so each case below is a
    # single assignment or push.
    my $slot = \$self->{'saved_metadata'}->{$mname};

    if (!$md_accumulate) {
        # override mode
        $$slot = $mvalue;
    }
    elsif (!defined $$slot) {
        # accumulate mode - first value for this field
        $$slot = [$mvalue];
    }
    elsif (ref ($$slot) eq "ARRAY") {
        # accumulate mode - add value to existing values
        push (@{$$slot}, $mvalue);
    }
    else {
        # accumulate mode - existing single value becomes a two-element list
        $$slot = [$$slot, $mvalue];
    }
}


1;
