#!/usr/bin/perl


BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
}

use strict;
no strict 'subs'; # allow barewords (eg STDERR) as function arguments
no strict 'refs'; # allow filehandles to be variables and vice versa

use encodings;
use printusage;
use parse2;
use FileHandle;

my $unicode_list =
    [ { 'name' => "auto",
	'desc' => "{BasPlug.input_encoding.auto}" },
      { 'name' => "ascii",
	'desc' => "{BasPlug.input_encoding.ascii}" },
      { 'name' => "utf8",
	'desc' => "{BasPlug.input_encoding.utf8}" },
      { 'name' => "unicode",
	'desc' => "{BasPlug.input_encoding.unicode}" } ];

my $e = $encodings::encodings;
foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) 
{
    my $hashEncode =
    {'name' => $enc,
     'desc' => $e->{$enc}->{'name'}};
    
    push(@{$unicode_list},$hashEncode);
}

my $arguments = 
    [ 
      { 'name' => "language",
	'desc' => "{scripts.language}",
	'type' => "string",
	'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "plugin",
	'desc' => "{explode.plugin}",
	'type' => "string",
	'reqd' => "yes",
	'hiddengli' => "yes"},
      { 'name' => "input_encoding",
	'desc' => "{explode.encoding}",
	'type' => "enum",
	'deft' => "auto",
	'list' => $unicode_list,
	'reqd' => "no" },
      { 'name' => "metadata_set",
	'desc' => "{explode.metadata_set}",
	'type' => "string",
	'reqd' => "no" },
      { 'name' => "document_field",
	'desc' => "{explode.document_field}",
	'type' => "string",
	'reqd' => "no"},
       { 'name' => "document_prefix",
	'desc' => "{explode.document_prefix}",
	'type' => "string",
	'reqd' => "no"},
      { 'name' => "document_suffix",
	'desc' => "{explode.document_suffix}",
	'type' => "string",
	'reqd' => "no"},
      { 'name' => "records_per_folder",
	'desc' => "{explode.records_per_folder}",
	'type' => "int",
	'range' => "0,",
	'deft' => "100",
	'reqd' => "no" },
      { 'name' => "verbosity",
	'desc' => "{import.verbosity}",
	'type' => "int",
	'range' => "0,",
	'deft' => "1",
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "xml",
	'desc' => "",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" }
      ];
	
my $options = { 'name' => "explode_metadata_database.pl",
		'desc' => "{explode.desc}",
		'args' => $arguments };

	    
sub main
{
    my ($language, $input_encoding, $metadata_set, $plugin, 
	$document_field, $document_prefix, $document_suffix, $records_per_folder, $verbosity);

    my $xml = 0;

    my $hashParsingResult = {};
    # parse the options
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{explode.params}");
	die "\n";
    }

    foreach my $strVariable (keys %$hashParsingResult)
    {
	eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
	print "\n";
	return;
    }

    # There should one arg left after parsing (the filename)
    # Or the user may have specified -h, in which case we output the usage
    if($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
	&PrintUsage::print_txt_usage($options, "{explode.params}");
	die "\n";
    }

    # The metadata database filename is the first value that remains after the options have been parsed out
    my $filename = $ARGV[0];
    if (!defined $filename || $filename !~ /\w/) { 
	&PrintUsage::print_txt_usage($options, "{explode.params}");
	print STDERR "You need to specify a filename";
	die "\n";
    }
    # check that file exists
    if (!-e $filename) {
	print STDERR "File $filename doesn't exist...\n";
	die "\n";
    }
    # check required options
    if (!defined $plugin || $plugin !~ /\w/) {
	&PrintUsage::print_txt_usage($options, "{explode.params}");
	print STDERR "You need to specify a plugin";
	die "\n";
    }
    
    # check metadata set
    if (defined $metadata_set && $metadata_set =~ /\w/) {
	$metadata_set .= ".";
    } else {
	$metadata_set = "";
    }

    my $plugobj;
    require "$plugin.pm";
    eval ("\$plugobj = new $plugin()");
    die "$@" if $@;

    # ...and initialize it
    $plugobj->init(1, "STDERR", "STDERR");
    
    if ($input_encoding eq "auto") {
	($language, $input_encoding) = $plugobj->textcat_get_language_encoding ($filename);
    }

    my $text = "";
    # Use the plugin's read_file function to avoid duplicating code
    $plugobj->read_file($filename, $input_encoding, undef, \$text);
    # is there any text in the file??
    die "\n" unless length($text);

    # Create a directory to store the document files...
    my ($documents_directory_base) = ($filename =~ /(.*)\.[^\.]+$/);

    # Split the text into records, using the plugin's split_exp
    my $split_exp = $plugobj->{'split_exp'};
    my @metadata_records = split(/$split_exp/, $text);
    print STDERR "Number of records: " . scalar(@metadata_records) . "\n";

    # Write the metadata from each record to the metadata.xml file
    my $record_number = 1;
    my $documents_directory;
    foreach my $record_text (@metadata_records) {
	# Check if we need to start a new directory for these records
	if (($record_number % $records_per_folder) == 1) {
	    $documents_directory = $documents_directory_base;
	    if (scalar(@metadata_records) > $records_per_folder) {
		$documents_directory .= "." . sprintf("%8.8d", $record_number);
	    }
	    if (-d $documents_directory) {
		die "Error: document directory $documents_directory already exists (bailing).\n";
	    }
	    &util::mk_dir($documents_directory);

	    my $documents_metadata_xml_file = &util::filename_cat($documents_directory, "metadata.xml");
	    if (-e $documents_metadata_xml_file) {
		die "Error: documents metadata.xml file $documents_metadata_xml_file already exists (bailing).\n";
	    }

	    # Start the metadata.xml file
	    open(METADATA_XML_FILE, ">$documents_metadata_xml_file");
	    print METADATA_XML_FILE
		"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
		"<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
		"<DirectoryMetadata>\n";
	}

	# Use the plugin's process function to avoid duplicating code
	my $doc_obj = new doc($filename, "nonindexed_doc");
	$plugobj->process(\$record_text, undef, undef, $filename, undef, $doc_obj, 0);
	# Get all the metadata assigned to this record
	my $record_metadata = $doc_obj->get_all_metadata($doc_obj->get_top_section());
	my $document_file;
	
	# try to get a doc to attach the metadata to
	if (defined $document_field) {
	    foreach my $pair (@$record_metadata) {
		my ($field, $value) = (@$pair);
		$value =~ s/\\\\/\\/g;

		# Does this metadata element specify a document to obtain?
		if ($field eq $document_field) {
		    my $document_file_full = $document_prefix . $value . $document_suffix;
		    $document_file = &obtain_document($document_file_full, $documents_directory, $verbosity);
		    &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
		}
	    }
	}
	# Create a dummy .nul file if we haven't obtained any documents for this record
	if (not defined $document_file) {
	    $document_file = sprintf("%8.8d", $record_number) . ".nul";
	    open(DUMMY_FILE, ">$documents_directory/$document_file");
	    close(DUMMY_FILE);
	    &write_metadata_xml_file_entry(METADATA_XML_FILE, $document_file, $record_metadata, $metadata_set);
	}

	if (($record_number % $records_per_folder) == 0 || $record_number == scalar(@metadata_records)) {
	    # Finish and close the metadata.xml file
	    print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
	    close(METADATA_XML_FILE);
	}
	$record_number = $record_number + 1;
    }

    # Explode means just that: the original file is deleted
    &util::rm($filename);
    $plugobj->clean_up_after_exploding();
}


sub write_metadata_xml_file_entry
{
    my $metadata_xml_file = shift(@_);
    my $file_name = shift(@_);
    my $record_metadata = shift(@_);
    my $meta_prefix = shift(@_);
    
    # Make $file_name XML-safe
    $file_name =~ s/&/&amp;/g;
    $file_name =~ s/</&lt;/g;
    $file_name =~ s/>/&gt;/g;

    # Convert $file_name into a regular expression that matches it
    $file_name =~ s/\./\\\./g;
    $file_name =~ s/\(/\\\(/g;
    $file_name =~ s/\)/\\\)/g;
    $file_name =~ s/\{/\\\{/g;
    $file_name =~ s/\}/\\\}/g;
    $file_name =~ s/\[/\\\[/g;
    $file_name =~ s/\]/\\\]/g;
    
    print $metadata_xml_file
	"\n" .
        "  <FileSet>\n" .
	"    <FileName>$file_name</FileName>\n" .
	"    <Description>\n";

    foreach my $pair (@$record_metadata) {
	my ($field, $value) = (@$pair);

	# We're only interested in metadata from the database
	next if ($field eq "lastmodified");
	next if ($field eq "gsdlsourcefilename");
	next if ($field eq "gsdldoctype");
	next if ($field eq "FileFormat");

	# Ignore the ^all metadata, since it will be invalid if the source metadata is changed
	next if ($field =~ /\^all$/);  # ISISPlug specific!

	# Make $value XML-safe
	$value =~ s/&/&amp;/g;  # May mess up existing entities!
	$value =~ s/</&lt;/g;
	$value =~ s/>/&gt;/g;

	# we are not allowed & in xml except in entities. 
	# if there are undefined entities then parsing will also crap out.
	# should we be checking for them too?
	# this may not get all possibilities
	# $value =~ s/&([^;\s]*(\s|$))/&amp;$1/g;

	print $metadata_xml_file "      <Metadata mode=\"accumulate\" name=\"$meta_prefix$field\">$value</Metadata>\n";
    }

    print $metadata_xml_file
	"    </Description>\n" .
	    "  </FileSet>\n";
}

sub obtain_document
{
    my $document_file_full = shift(@_);
    my $documents_directory = shift(@_);
    my $verbosity = shift(@_);
    
    print STDERR "Obtaining document file $document_file_full...\n" if ($verbosity > 1);

    my $document_file_name;
    my $local_document_file;

    # Document specified is on the web
    if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {
	$document_file_full =~ /([^\/]+)$/;
	$document_file_name = $1;
	$local_document_file = &util::filename_cat($documents_directory, $document_file_name);

	my $wget_options = "--quiet";
	$wget_options = "--verbose" if ($verbosity > 2);
	$wget_options .= " --timestamping";  # Only re-download files if they're newer
	my $wget_command = "wget $wget_options \"$document_file_full\" --output-document \"$local_document_file\"";
	`$wget_command`;

	# Check the document was obtained successfully
	if (!-e $local_document_file) {
	    print STDERR "WARNING: Could not obtain document file $document_file_full\n";
	}
    }
    # Document specified is on the disk
    else {
	my $dir_sep = &util::get_os_dirsep();
	$document_file_full =~ /(.+$dir_sep)?(.*)$/;
	$document_file_name = $2;
	$local_document_file = &util::filename_cat($documents_directory, $document_file_name);

	# Only bother trying to copy the file if it contained some path information
	if ($document_file_full ne $document_file_name) {
	    &util::cp($document_file_full, $documents_directory);

	    # Check the document was obtained successfully
	    if (!-e $local_document_file) {
		print STDERR "WARNING: Could not obtain document file $document_file_full\n";
	    }
	}
    }

    # If the document wasn't obtained successfully, create a .nul file for it
    if (!-e $local_document_file) {
	$document_file_name .= ".nul";
	open(NULL_FILE, ">$local_document_file.nul");
	close(NULL_FILE);
    }

    return $document_file_name;
}

&main(@ARGV);
