###########################################################################
#
# WebDownload.pm -- base class for all the import plugins
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package OAIDownload;

eval {require bytes};

# suppress the annoying "subroutine redefined" warning that various
# plugins cause under perl 5.6
$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};

use strict;

use WgetDownload;
use XMLParser;

use POSIX qw(tmpnam);
use util;

sub BEGIN {
    @OAIDownload::ISA = ('WgetDownload');
}

my $arguments = 
    [ { 'name' => "url", 
	'disp' => "{OAIDownload.url_disp}",
	'desc' => "{OAIDownload.url}",
	'type' => "string",
	'reqd' => "yes"},
      { 'name' => "set", 
	'disp' => "{OAIDownload.set_disp}",
	'desc' => "{OAIDownload.set}",
	'type' => "string",
	'reqd' => "no"},
      { 'name' => "get_doc",
	'disp' => "{OAIDownload.get_doc_disp}",
	'desc' => "{OAIDownload.get_doc}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "max_records", 
	'disp' => "{OAIDownload.max_records_disp}",
	'desc' => "{OAIDownload.max_records}",
	'type' => "int",
	'deft' => "500",
	'range' => "1,",
	'reqd' => "no"} ];

my $options = { 'name'     => "OAIDownload",
		'desc'     => "{OAIDownload.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args'     => $arguments };

my $self;

my $strWgetOptions="";

sub new 
{
    my ($class) = shift (@_);
    my ($getlist,$inputargs,$hashArgOptLists) = @_;
    push(@$getlist, $class);

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};

    $self = (defined $hashArgOptLists)? new WgetDownload($getlist,$inputargs,$hashArgOptLists): new WgetDownload($getlist,$inputargs);

    if ($self->{'info_only'}) {
	# don't worry about any options etc
	return bless $self, $class;
    }

    my $parser = new XML::Parser('Style' => 'Stream',
				 'Handlers' => {'Char' => \&Char,
						'Start' => \&OAI_StartTag,
						'End' => \&OAI_EndTag
						});
    $self->{'parser'} = $parser;
    
    return bless $self, $class;
}

sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    print STDERR "here2";
   
    $strWgetOptions = $self->getWgetOptions();
    my $cmdWget = $strWgetOptions;
  
    my $strOutputDir ="";
    $strOutputDir = $hashGeneralOptions->{"cache_dir"};
    my $strBasURL = $self->{'url'};
    my $intMaxRecords = $self->{'max_records'};
    my $blnDownloadDoc = $self->{'get_doc'};

    print STDERR "<<Defined Maximum>>\n";

    my $strIDs = $self->getOAIIDs($strBasURL);
 
   if($strIDs eq "")
    {
	print STDERR "Error: No ID being found\n";
	return 0;
    }
    my $aryIDs = $self->parseOAIIDs($strIDs);
    my $intIDs = 0;
    if($self->{'max_records'} < scalar(@$aryIDs))
    {
	$intIDs = $self->{'max_records'};
    }
    else
    {
	$intIDs = scalar(@$aryIDs);
    }
    print STDERR "<<Total number of record(s):$intIDs>>\n";

    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);

    my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";
    &util::rm($tmp_file); 

    return 1;
}

sub getOAIIDs
{
    my ($self,$strBasURL) = @_;
    my ($cmdWget);
     
    my $wgetOptions = $self->getWgetOptions();

    $cmdWget = $wgetOptions;
  
    print STDERR  "Gathering OAI identifiers.....\n";

    if($self->{'set'} ne "")
    {
	$cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc&set=$self->{'set'}\" ";
    }
    else
    {
	$cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc\" ";
    }

 
    my $strIDs =  $self->useWget($cmdWget);

    if (!defined $strIDs or $strIDs eq ""  ){
	print STDERR "Server information is unavailable.\n";
	print STDERR "<<Finished>>\n";
        return;  
    }

    print STDERR "<<Download Information>>\n";

    $self->parse_xml($strIDs);

    return $strIDs;
}

sub parseOAIIDs
{   
    my ($self,$strIDs) = @_;

    print STDERR "Parsing OAI identifiers.....\n";
    $strIDs =~ s/^.*?<identifier>/<identifier>/s;
    $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;

    my @aryIDs = ();

    while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
    {
	$strIDs = $2;
	push(@aryIDs,$1);
    }
    
    return \@aryIDs;
} 

sub dirFileSplit
{
    my ($self,$strFile) = @_;

    my @aryDirs = split("[/\]",$strFile);
   
    my $strLocalFile = pop(@aryDirs);
    my $strSubDirs = join("/",@aryDirs);

    return ($strSubDirs,$strLocalFile);
}

sub getOAIDoc
{
    my ($self,$strRecord, $strSubDirPath) = @_;
 
    print  STDERR "Gathering source documents.....\n";
    # look out for identifier tag in metadata section
   
    if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
    {
	my $strMetaTag = $1;
	
	if ($strMetaTag =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s)
	{
	    my $strDocURL = $2;

	    my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL);

            my $strSoureDirPath ="";

	    $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs");

	    &util::mk_dir($strSoureDirPath)  if (!-e "$strSoureDirPath");
	    
	    my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile);
	    
	    my $wget_cmd = $strWgetOptions." -q -O $strFullDocFilePath \"$strDocURL\"";

	    my $strResponse =  $self->useWget($wget_cmd,1);

	    if($strResponse ne "")
	    {
		print STDERR "Error occured while retriving OAI souce documents: $strResponse\n";
		exit(-1);
	    }

	    $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$strDocURL<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$strDocURL<\/OrigURL>\n   <identifier>srcdocs\/$strDocFile<\/identifier>$4<\/metadata>/s;
	}
	else
	{
	    print  STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
	}
    }
    else
    {
	print  STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n";
    }
    
}

sub getOAIRecords
{
    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;

    my $intDocCounter = 0;

    foreach my $strID ( @$aryIDs)
    {
	print  STDERR "Gathering OAI record with ID:$strID.....\n";
	   
	my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=oai_dc&identifier=$strID\"";

	my $strRecord =  $self->useWget($cmdWget);

        
        my @fileDirs = split(":",$strID);  

	# setup directories

        $strOutputDir  =~ s/"//g; #"

        my $host =$self->{'url'}; 
  
        $host =~ s/http:\/\///g;

        $host =~ s/:.*//g; 

	my $strFileURL = "$strOutputDir/$host/$fileDirs[0]/$fileDirs[1].oai";
	
	# prepare subdirectory for record (if needed)
	my ($strSubDirPath,$unused) = ("", "");

       	($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
   
	&util::mk_all_dir($strSubDirPath);

	my $ds = &util::get_dirsep();
	
	if($blnDownloadDoc)
	{
	    $self->getOAIDoc($strRecord,$strSubDirPath);
	}

	# save record 
	open (OAIOUT,">$strFileURL")
	    || die "Unable to save oai metadata record: $!\n";
	print OAIOUT $strRecord;
	close(OAIOUT);

        print STDERR "Saving records to $strFileURL\n";
        print STDERR "<<Done>>\n";
	$intDocCounter ++;	
	last if ($intDocCounter >= $intMaxRecords);
    }

    ($intDocCounter >= $intMaxRecords) ? 
	print  STDERR "Reach maximum download records, use -max_records to set the maximum.\n": 
	print  STDERR "Complete download meta record from $strBasURL\n";

       print STDERR "<<Finished>>\n";
}

sub url_information
{
    my ($self) = shift (@_);
    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
    
    my $wgetOptions = $self->getWgetOptions();
    my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
 
    my $strIdentify = "verb=Identify";
    my $strListSets = "verb=ListSets";

    my $strIdentifyCMD = $strBaseCMD;
    $strIdentifyCMD =~ s/_OPTS_/$strIdentify/;	

    my $strIdentifyText = $self->useWget($strIdentifyCMD);

     if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
	print STDERR "Server information is unavailable.\n";
	print STDERR "<<Finished>>\n";
        return;  
    }

    print STDERR "General information:\n";
    $self->parse_xml($strIdentifyText);

    my $strListSetCMD = $strBaseCMD;
    $strListSetCMD =~ s/_OPTS_/$strListSets/;	
    my $strListSetsText = $self->useWget($strListSetCMD);


    print STDERR "List Information:\n";
    $self->parse_xml($strListSetsText);
}

sub parse_xml
{    
    my ($self) = shift (@_);
    my ($strOutputText) = @_;
   
    #Open a temporary file to store OAI information, and store the information to the temp file
    my $name = "$ENV{GSDLHOME}/tmp/oai.tmp"; 

    open(*OAIOUT,"> $name");
	
    print OAIOUT $strOutputText;
    close(OAIOUT);

    $self->{'temp_file_name'} = $name;

    eval {
	$self->{'parser'}->parsefile("$name");
    };
    
    if ($@) {
	die "OAI: $name is not a well formed XML file ($@)\n";
    }
}

END{
    if($self->{'info'})
    {
	unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
    }
}

# This Char function overrides the one in XML::Parser::Stream to overcome a
# problem where $expat->{Text} is treated as the return value, slowing
# things down significantly in some cases.
sub Char {   
    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
    $_[0]->{'Text'} .= $_[1];
    if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
	$self->{'text'} .= $_[1];
	$self->{'text'} =~ s/[\n]|([ ]{2,})//g;
	if($self->{'text'} ne "")
	{	    
	    print STDERR " $self->{'subfield'}:($self->{'text'})\n";
	}
    }
    return undef;
}

sub OAI_StartTag
{
    my ($expat, $element, %attr) = @_;

    $self->{'subfield'} = $element;
    
}

sub OAI_EndTag
{
    my ($expat, $element) = @_;
    $self->{'text'} = "";
    $self->{'subfield'} = "";
}

sub error
{
    my ($self,$strFunctionName,$strError) = @_;
    {
	print "Error occoured in OAIDownload.pm\n".
	    "In Function:".$strFunctionName."\n".
	    "Error Message:".$strError."\n";
	exit(-1);
    }
}



1;
