###########################################################################
#
# GMLPlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# plugin which processes a GML format document
# assumes that gml tags are all in lower-case.

# 12/05/02 Added usage datastructure - John Thompson

package GMLPlug;

use BasPlug;
use util;
use doc;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

sub BEGIN {
    @GMLPlug::ISA = ('BasPlug');
}

my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BasPlug.process_exp}",
	'type' => "regexp",
	'deft' =>  &get_default_process_exp() } 
    ];

my $options = { 'name'     => "GMLPlug",
		'desc'     => "{GMLPlug.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };

sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};

    my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}

sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.gml?$^;
}

# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might 
# include directories
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    #check process and block exps, smart block, etc
    my ($block_status,$filename) = $self->read_block(@_);    
    return $block_status if ((!defined $block_status) || ($block_status==0));

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    print STDERR "<Processing n='$file' p='GMLPlug'>\n" if ($gli);
    print $outhandle "GMLPlug: processing $file\n";

    my $parent_dir = $file;
    $parent_dir =~ s/[^\\\/]*$//;
    $parent_dir = &util::filename_cat ($base_dir, $parent_dir);

    if (!open (INFILE, $filename)) {
	if ($gli) {
	    print STDERR "<ProcessingError n='$file' r='Could not read $filename'>\n";
	}
	print $outhandle "GMLPlug::read - couldn't read $filename\n";
	return -1;
    }

    undef $/;
    my $gml = <INFILE>;
    $/ = "\n";
    close (INFILE);
    
    my @gml_sections = split("</gsdlsection>",$gml);
    $gml = shift(@gml_sections);

    my $no_docs = 0;

    while (1) {
	# create a new document
	my $doc_obj = new doc ();
	my $section = $doc_obj->get_top_section();

	# process the document
	my $firstsection = 1;
	while (1) {
	    my ($tags, $text) = ("", "");

	    my @indenting_sections = split("<gsdlsection", $gml);
	    shift(@indenting_sections); # skips over xml header if present

	    foreach $gml (@indenting_sections) {

		if ($gml =~ /^\s*([^>]*)>(.*)$/so) {
		    $tags = $1 if defined $1;
		    $text = &GMLPlug::_unescape_text($2);

		} else {
		    print $outhandle "GMLPlug::read - error in file $filename\n";
		    print $outhandle "text: \"$gml\"\n";
		    last;
		}

		# create the section (unless this is the first section)
		if ($firstsection) {
 		    $firstsection = 0;
#		    $tags =~ /gsdlsourcefilename\s*=\s*(?:\"([^\"]*)\")/o;
#		    $src_filename = $2 || $3;

		} else {

		    $tags =~ s/gsdlnum\s*=\s*\"?(\d+)\"?//o;
		    if (defined $1) {
			$section .= ".$1";
			$doc_obj->create_named_section($section);
		    } else {
			$section = $doc_obj->insert_section($doc_obj->get_end_child($section));
		    }
		}
	    
		# add the metadata
		# could be stored as either attributes or ....
		while ((defined $tags) && ($tags =~ s/^\s*(\S+)=\"([^\"]*)\"//o)) {
		    $doc_obj->add_utf8_metadata($section, $1, &GMLPlug::_unescape_text($2)) 
			if (defined $1 and defined $2);

		}
		
		# ... or tags (xml compliant)
		if ($text =~ m/^\s*<metadata>/)
		{
		    my ($metadata, $tagname, $tagvalue);
		    ($metadata,$text) 
			= ($text =~ m/\s*<metadata>\s*(<.*)\s*<\/metadata>(.*)$/s);

		    # note: \1 refers to 1st match within regexp, so we can
		    # handle the unescaped text here...
		    while ((defined $metadata) 
			   && ($metadata =~ s/<(.*?)>(.*?)<\/\1>//s))
		    {
			if (defined $1 && defined $2) 
			{
			    $tagname = $1;
			    $tagvalue = $2;
			    
			    # if tagname begins with '/' it will be escaped
			    $tagname =~ s/^&\#47;/\//;
			    
			    $doc_obj->add_utf8_metadata($section, $tagname, &GMLPlug::_unescape_text($tagvalue)); 
			}
		    }
		}

		# add the text

		$doc_obj->add_utf8_text($section, $text) 
		    if ((defined $text) && ($text ne ""));		
	    }

	    $gml = shift(@gml_sections); # get next bit of data
	    last unless defined $gml;
 	    last if $section eq ""; # back to top level again (more than one document in gml file)
	    $section = $doc_obj->get_parent_section ($section);
	} # while (1) section level
	
	# add the FileFormat as the metadata
	$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "GML");

	# add the associated files
	my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
	my ($assoc_file_info);
	
	foreach $assoc_file_info (@$assoc_files) 
	{
	    my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
	    my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
	    my $assoc_dir = (defined $dir && $dir ne "") 
		? &util::filename_cat($dir, $assoc_file) : $assoc_file;
	    $doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
	    
	}
	$doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");

	# add metadata passed in from elsewhere
	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	
	# do any automatic metadata extraction
	$self->auto_extract_metadata ($doc_obj);

	# assume the document has an OID already

	# process the document
	$processor->process($doc_obj, $file);
	
	$no_docs++;
	last if ($maxdocs > -1 && ($total_count+$no_docs) >= $maxdocs);
	last unless defined $gml && $gml =~ /\w/;
    } # while(1) document level
    
    return $no_docs; # no of docs processed
}

sub _unescape_text {
    my ($text) = @_;

    # special characters in the gml encoding
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/\"/g;
    $text =~ s/&amp;/&/g; # this has to be last...

    return $text;
}

1;
