#!/usr/bin/perl -w

###########################################################################
#
# export.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program exports one or more collections into a specific format (DSpace, METS, GA or MARCXML)
# Author: Chi-Yu Huang Date: 08-10-2004

package export;

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugouts");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
}

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

use arcinfo;
use colcfg;
use plugin;
use plugout;
use util;
use scriptutil;
use FileHandle;
use gsprintf;
use printusage;
use parse2;


# Values accepted by the -OIDtype option.  Each 'desc' is a reference to a
# description string rather than the description text itself.
my $oidtype_list = [
    { name => "hash",        desc => "{import.OIDtype.hash}" },
    { name => "incremental", desc => "{import.OIDtype.incremental}" },
    { name => "assigned",    desc => "{import.OIDtype.assigned}" },
    { name => "dirname",     desc => "{import.OIDtype.dirname}" },
];

#** Output formats that can be selected with the -saveas option.  Each
#** 'desc' is a reference to a description string.
my $saveas_list = [
    { name => "DSpace",  desc => "{export.saveas.DSpace}" },
    { name => "METS",    desc => "{export.saveas.METS}" },
    { name => "GA",      desc => "{export.saveas.GA}" },
    { name => "MARCXML", desc => "{export.saveas.MARCXML}" },
];


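# Possible attributes for each argument
# name: The name of the argument
# desc: A description (or more likely a reference to a description) for this argument
# type: The type of control used to represent the argument. Options include: string, int, flag, regexp, metadata, language, enum etc
# list: For enum arguments, the list of permitted values (each with its own name and desc)
# deft: The default value of the argument, if it has one
# range: For int arguments, the permitted range of values (e.g. "0,3" or "1,")
# reqd: Is this argument required?
# hiddengli: Is this argument hidden in GLI?
# modegli: The lowest detail mode this argument is visible at in GLI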

# Definition of the -saveas option.  Kept in its own variable because it is
# used twice: as the first entry of $arguments and as the only entry of
# $listall_options.
my $saveas_argument = {
    name    => "saveas",
    desc    => "{export.saveas}",
    type    => "enum",
    list    => $saveas_list,
    deft    => "METS",
    reqd    => "no",
    modegli => "3",
};


# Complete list of command-line options understood by export.pl.  It is
# handed to parse2::parse to read @ARGV and, through $options, to the usage
# printers.  See the attribute legend above for the meaning of each key.
my $arguments = [
    $saveas_argument,
    {   name => "saveas_version",
        desc => "{export.saveas_version}",
        type => "string",
        reqd => "no",
        deft => "greenstone",
    },
    {   name      => "exportdir",
        desc      => "{export.exportdir}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "importdir",
        desc      => "{import.importdir}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "collectdir",
        desc      => "{export.collectdir}",
        type      => "string",
        deft      => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name => "listall",
        desc => "{export.listall}",
        type => "flag",
        reqd => "no",
    },
    {   name      => "debug",
        desc      => "{export.debug}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name    => "faillog",
        desc    => "{export.faillog}",
        type    => "string",
        deft    => "",
        reqd    => "no",
        modegli => "4",
    },
    {   name      => "keepold",
        desc      => "{export.keepold}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name    => "removeold",
        desc    => "{export.removeold}",
        type    => "flag",
        reqd    => "no",
        modegli => "3",
    },
    {   name    => "language",
        desc    => "{scripts.language}",
        type    => "string",
        reqd    => "no",
        modegli => "4",
    },
    {   name    => "maxdocs",
        desc    => "{export.maxdocs}",
        type    => "int",
        reqd    => "no",
        range   => "1,",
        modegli => "1",
    },
    {   name    => "OIDtype",
        desc    => "{import.OIDtype}",
        type    => "enum",
        list    => $oidtype_list,
        # No default is declared here on purpose (parsearg left "" as the
        # default); main() falls back to collect.cfg and then to "hash".
        #deft   => "hash",
        reqd    => "no",
        modegli => "3",
    },
    {   name    => "OIDmetadata",
        desc    => "{import.OIDmetadata}",
        type    => "metadata",
        deft    => "dc.Identifier",
        reqd    => "no",
        modegli => "3",
    },
    {   name      => "out",
        desc      => "{export.out}",
        type      => "string",
        deft      => "STDERR",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "statsfile",
        desc      => "{export.statsfile}",
        type      => "string",
        deft      => "STDERR",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "xsltfile",
        desc      => "{BasPlugout.xslt_file}",
        type      => "string",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "xslt_txt",
        desc      => "{METSPlugout.xslt_txt}",
        type      => "string",
        reqd      => "no",
        hiddengli => "no",
    },
    {   name      => "xslt_mets",
        desc      => "{METSPlugout.xslt_mets}",
        type      => "string",
        reqd      => "no",
        hiddengli => "no",
    },
    {   name      => "mapping_file",
        desc      => "{MARCXMLPlugout.mapping_file}",
        type      => "string",
        reqd      => "no",
        hiddengli => "no",
    },
    {   name      => "group_marc",
        desc      => "{MARCXMLPlugout.group}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "no",
    },
    {   name    => "verbosity",
        desc    => "{export.verbosity}",
        type    => "int",
        range   => "0,3",
        deft    => "2",
        reqd    => "no",
        modegli => "4",
    },
    {   name      => "gli",
        desc      => "",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
    {   name      => "xml",
        desc      => "{scripts.xml}",
        type      => "flag",
        reqd      => "no",
        hiddengli => "yes",
    },
];

# Script description passed to the usage printers: script name, a reference
# to its description, and the full option list.
my $options = {
    name => "export.pl",
    desc => "{export.desc}",
    args => $arguments,
};

# Reduced script description printed when -listall is given: identical to
# $options except that the only option it lists is -saveas.
my $listall_options = {
    name => "export.pl",
    desc => "{export.desc}",
    args => [ $saveas_argument ],
};

# Package-local shorthand: lets this script write &gsprintf(...) and have the
# call forwarded, with its arguments untouched, to gsprintf::gsprintf.
# Whatever gsprintf::gsprintf returns is returned to the caller.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}


# Run the script.  main() is defined further down in this file; it is called
# with an empty argument list and reads its input from @ARGV.
&main();

# Script body: parse the command line, then export every collection named on
# it using the plugout selected by -saveas.
#
# Options are read from @ARGV by parse2::parse (driven by $arguments); the
# arguments left over afterwards are taken to be collection names and are
# processed one at a time.  Each collection is handled inside an eval, so a
# failure in one collection is printed to STDERR and the next one is tried.
#
# Takes no parameters and has no meaningful return value.  Dies with "\n"
# after printing usage when the options cannot be parsed, when -listall,
# -xml or a leading -h is given, or when no collection is named; also dies
# when the -out file cannot be opened.
#
# Note: several settings picked up from one collection's collect.cfg
# (verbosity, gzip, maxdocs, groupsize, OIDtype, sortmeta, debug, gli) stay
# in force for the collections that follow it on the command line.
sub main {
    # params (filled in from the parsed command line below)
    my ($language, $verbosity, $importdir, $exportdir, $keepold, $listall,
        $removeold, $saveas, $saveas_version, $debug, $OIDtype, $OIDmetadata,
        $maxdocs, $statsfile, $xsltfile, $mapping_file, $out, $faillog,
        $collectdir, $gli, $xslt_mets, $xslt_txt, $group_marc);
    my $xml = 0;

    # other vars
    my ($configfilename, $collection, $export_info_filename, $export_info, $processor, $pluginfo);

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    # Copy every parsed option into the lexical variable of the same name.
    # An explicit table of references is used instead of a string eval so
    # that only the variables listed here can ever be assigned to.
    my %variable_for = (
        'language'       => \$language,
        'verbosity'      => \$verbosity,
        'importdir'      => \$importdir,
        'exportdir'      => \$exportdir,
        'keepold'        => \$keepold,
        'listall'        => \$listall,
        'removeold'      => \$removeold,
        'saveas'         => \$saveas,
        'saveas_version' => \$saveas_version,
        'debug'          => \$debug,
        'OIDtype'        => \$OIDtype,
        'OIDmetadata'    => \$OIDmetadata,
        'maxdocs'        => \$maxdocs,
        'statsfile'      => \$statsfile,
        'xsltfile'       => \$xsltfile,
        'mapping_file'   => \$mapping_file,
        'out'            => \$out,
        'faillog'        => \$faillog,
        'collectdir'     => \$collectdir,
        'gli'            => \$gli,
        'xslt_mets'      => \$xslt_mets,
        'xslt_txt'       => \$xslt_txt,
        'group_marc'     => \$group_marc,
        'xml'            => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult) {
        next unless exists $variable_for{$strVariable};
        ${ $variable_for{$strVariable} } = $hashParsingResult->{$strVariable};
    }

    # these are options used by other things - we just set default values
    # undef means will be set from config file if there
    my $gzip = undef;
    my $groupsize = 1;
    #my $OIDtype = undef;
    my $sortmeta = undef;

    # An export directory counts as explicit only when one was really
    # supplied; an undefined or empty value means "derive it per collection".
    my $explicit_exportdir = (defined $exportdir && $exportdir ne "") ? 1 : 0;

    # save these command line settings. don't want config file settings in one
    # coll used for other colls
    # does this apply to other vars???
    my $global_removeold = $removeold;
    my $global_keepold = $keepold;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($listall) {
        if ($xml) {
            &PrintUsage::print_xml_usage($listall_options);
        }
        else {
            &PrintUsage::print_txt_usage($listall_options,"{export.params}");
        }
        die "\n";
    }
    elsif ($xml) {
        &PrintUsage::print_xml_usage($options);
        die "\n";
    }

    # can have more than one collection name,
    # if the first extra option is -h, then output the help
    if (scalar(@ARGV) == 0 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{export.params}");
        die "\n";
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # $out ends up holding the *name* of the handle that messages go to:
    # STDERR/STDOUT as given, or export::OUT when a file was requested.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die "\n";
        }
        $out = 'export::OUT';
        $close_out = 1;
    }
    $out->autoflush(1);

    while (scalar(@ARGV)>0) {
        my $collect_name = shift @ARGV;
        $ENV{'GSDLCOLLECTION'} = $collect_name;

        # Treat "not given" and "given as empty" alike, so the string
        # comparisons below never see undef.
        $importdir = "" unless defined $importdir;
        $exportdir = "" unless defined $exportdir;

        eval {
            # get and check the collection name
            if (($collection = &util::use_collection($collect_name, $collectdir)) eq "") {
                &PrintUsage::print_txt_usage($options, "{export.params}");
                die "\n";
            }
            # add collection's perllib dir  into include path in
            # case we have collection specific modules
            unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

            # Work out this collection's fail log.  $faillog itself is left
            # untouched so that the next collection does not mistake a
            # handle name for a file name.
            my $faillogname = (defined $faillog && $faillog ne "")
                ? $faillog
                : &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
            unless (open (FAILLOG, '>', $faillogname)) {
                &gsprintf(STDERR, "{export.cannot_open_fail_log}\n", $faillogname);
                die "\n";
            }
            my $faillog_handle = 'export::FAILLOG';
            $faillog_handle->autoflush(1);

            # check sortmeta
            $sortmeta = undef unless defined $sortmeta && $sortmeta =~ /\S/;
            if (defined $sortmeta && $groupsize > 1) {
                &gsprintf($out, "{export.cannot_sort}\n\n");
                $sortmeta = undef;
            }

            # get the list of plugins for this collection and set any options that
            # were specified in the collect.cfg (all export.pl options except
            # -collectdir, -out and -faillog may be specified in the collect.cfg (these
            # options must be known before we read the collect.cfg))
            my $plugins = [];
            my @global_opts = ();

            $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
            if (!-e $configfilename) {
                &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
                die "\n";
            }

            my $collectcfg = &colcfg::read_collect_cfg ($configfilename);
            if (defined $collectcfg->{'plugin'}) {
                $plugins = $collectcfg->{'plugin'};
            }

            if (!defined $verbosity || $verbosity !~ /\d+/) {
                if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
                    $verbosity = $collectcfg->{'verbosity'};
                } else {
                    $verbosity = 2; # the default
                }
            }
            if (defined $collectcfg->{'importdir'} && $importdir eq "") {
                $importdir = $collectcfg->{'importdir'};
            }
            if (defined $collectcfg->{'exportdir'} && $exportdir eq "") {
                $exportdir = $collectcfg->{'exportdir'};
            }

            if (defined $collectcfg->{'gzip'} && !$gzip) {
                if ($collectcfg->{'gzip'} =~ /^true$/i) {
                    $gzip = 1;
                }
            }
            if (!defined $maxdocs || $maxdocs !~ /\-?\d+/) {
                if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
                    $maxdocs = $collectcfg->{'maxdocs'};
                } else {
                    $maxdocs = -1; # the default
                }
            }
            if ($groupsize == 1) {
                if (defined $collectcfg->{'groupsize'} && $collectcfg->{'groupsize'} =~ /\d+/) {
                    $groupsize = $collectcfg->{'groupsize'};
                }
            }
            # The config file may name any of the OID types the command
            # line accepts, not just hash/incremental.
            if (!defined $OIDtype || ($OIDtype !~ /^(hash|incremental|assigned|dirname)$/)) {
                if (defined $collectcfg->{'OIDtype'} && $collectcfg->{'OIDtype'} =~ /^(hash|incremental|assigned|dirname)$/) {
                    $OIDtype = $collectcfg->{'OIDtype'};
                } else {
                    $OIDtype = "hash"; # the default
                }
            }
            if (defined $collectcfg->{'sortmeta'} && (!defined $sortmeta || $sortmeta eq "")) {
                $sortmeta = $collectcfg->{'sortmeta'};
            }
            if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
                $debug = 1;
            }
            if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
                $gli = 1;
            }

            # global plugin stuff
            if (defined $collectcfg->{'separate_cjk'}&& $collectcfg->{'separate_cjk'} =~ /^true$/i) {
                push @global_opts, "-separate_cjk";
            }

            ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($global_removeold, $global_keepold, 0, "export", $collectcfg);

            $gli = 0 unless defined $gli;

            print STDERR "<export>\n" if $gli;

            # fill in the default import and export directories if none
            # were supplied, turn all \ into / and remove trailing /
            $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
            $importdir =~ s/[\\\/]+/\//g;
            $importdir =~ s/\/$//;
            $exportdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "export") if $exportdir eq "";
            $exportdir =~ s/[\\\/]+/\//g;
            $exportdir =~ s/\/$//;

            # load all the plugins
            $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog_handle, \@global_opts);

            if (scalar(@$pluginfo) == 0) {
                &gsprintf($out, "{import.no_plugins_loaded}\n");
                die "\n";
            }

            # remove the old contents of the export directory if needed
            if ($removeold && -e $exportdir) {
                &gsprintf($out, "{export.removing_export}\n");
                &util::rm_r ($exportdir);
            }

            # read the export information file

            # Export to DSpace Archive format or METs format
            # If saveas=DSpace, a "contents" file will be created, otherwise "export.inf"

            # the plugouts should be doing this!!
            if ($saveas eq "DSpace"){
                $export_info_filename = &util::filename_cat ($exportdir, "contents");
            } elsif ($saveas eq "METS" || $saveas eq "GA" || $saveas eq "MARCXML" ) {
                # "MARCXML" is the value -saveas actually accepts; testing
                # for "MARC" left the file name undefined for that format.
                $export_info_filename = &util::filename_cat ($exportdir, "export.inf");
            }

            $export_info = arcinfo->new();
            $export_info -> load_info ($export_info_filename);

            if ($saveas !~ /^(GA|METS|DSpace|MARCXML)$/) {
                if (defined $collectcfg->{'plugout'} && $collectcfg->{'plugout'}[0] =~ /^(GAPlugout|METSPlugout)$/) {
                    $saveas = $collectcfg->{'plugout'}[0];
                } else {
                    $saveas ="GAPlugout";
                }
            }


            ####Use Plugout####
            my ($plugout_name);
            if ($saveas !~ /^(GA|METS|DSpace|MARCXML)Plugout$/  ){
                $plugout_name = $saveas."Plugout";
            }
            else {
                $plugout_name = $saveas;
            }

            my $opts=[];

            push @$opts,("-output_info",$export_info) if (defined $export_info);
            push @$opts,("-verbosity",$verbosity) if (defined $verbosity);
            push @$opts,("-debug") if ($debug);
            push @$opts,("-gzip_output",$gzip) if (defined $gzip);
            push @$opts,("-group_size",$groupsize) if (defined $groupsize);
            push @$opts,("-output_handle",$out) if (defined $out);
            push @$opts,("-xslt_file",$xsltfile) if (defined $xsltfile);
            push @$opts,("-group") if ($group_marc && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$opts,("-mapping_file",$mapping_file) if (defined $mapping_file && $plugout_name =~ /^MARCXMLPlugout$/);
            push @$opts,("-saveas_version",$saveas_version) if (defined $saveas_version && $plugout_name =~ /^METSPlugout$/);
            push @$opts,("-xslt_mets",$xslt_mets) if (defined $xslt_mets && $plugout_name =~ /^METSPlugout$/);
            push @$opts,("-xslt_txt",$xslt_txt) if (defined $xslt_txt && $plugout_name =~ /^METSPlugout$/);
            $processor = &plugout::load_plugout($plugout_name,$opts);

            $processor->setoutputdir ($exportdir);

            $processor->set_sortmeta ($sortmeta) if defined $sortmeta;
            $processor->set_OIDtype ($OIDtype, $OIDmetadata);

            &plugin::begin($pluginfo, $importdir, $processor, $maxdocs);

            # process the import directory
            &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs,0, $gli);

            &plugin::end($pluginfo, $processor);

            &plugin::deinit($pluginfo, $processor);

            # write out the export information file
            $processor->close_file_output() if $groupsize > 1;
            $processor->close_group_output() if $processor->is_group();
            if ($saveas eq "METS") {
                $export_info->save_info($export_info_filename);
            }

            # write out export stats
            # $stats_handle is the handle name given to write_stats;
            # $statsfile keeps the user's setting for the next collection.
            my $close_stats = 0;
            my $stats_handle = $statsfile;
            if ($statsfile !~ /^(STDERR|STDOUT)$/i) {
                if (open (STATS, '>', $statsfile)) {
                    # STATS is opened in this package, so its full name
                    # is export::STATS (not import::STATS).
                    $stats_handle = 'export::STATS';
                    $close_stats = 1;
                } else {
                    &gsprintf($out, "{import.cannot_open_stats_file}", $statsfile);
                    &gsprintf($out, "{import.stats_backup}\n");
                    $stats_handle = 'STDERR';
                }
            }

            &gsprintf($out, "\n");
            &gsprintf($out, "*********************************************\n");

            &plugin::write_stats($pluginfo, $stats_handle, $faillogname, $gli);
            if ($close_stats) {
                close STATS;
            }

            &gsprintf($out, "*********************************************\n");

            close FAILLOG;
        };

        if ($@) {
            print STDERR $@;
        }

##      $ENV{'GSDLCOLLECTION'} = undef;
        $importdir = "";
        # A derived export directory belongs to the collection just
        # processed; only an explicitly given one carries over.
        $exportdir = "" unless $explicit_exportdir;
        # NOTE(review): this assignment has no lasting effect, because
        # $removeold is recomputed from $global_removeold at the top of the
        # next iteration.  Presumably the intent was to stop later
        # collections wiping a shared export directory - confirm before
        # changing $global_removeold instead.
        $removeold = 0 if ($explicit_exportdir);

    } # while processing ARGV

    &gsprintf($out, "\n");
    &gsprintf($out, "*********************************************\n");
    &gsprintf($out, "* {export.complete}\n");
    &gsprintf($out, "*********************************************\n");

    # Close the -out file only now: every collection, and the banner above,
    # writes to it.
    close OUT if $close_out;

}
