#!/usr/bin/perl -w

## @file buildcol.pl
# This program will build a particular collection.
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# @note 11/04/03 Added usage datastructure - John Thompson
#
# @author New Zealand Digital Library Project unless otherwise stated
# @copy 1999 New Zealand Digital Library Project
#
package buildcol;

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
}

use colcfg;
use util;
use scriptutil;
use FileHandle;
use gsprintf;
use printusage;
use parse2;

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa
no strict 'subs'; # allow barewords (eg STDERR) as function arguments

my $mode_list =
    [ { 'name' => "all",
        'desc' => "{buildcol.mode.all}" },
      { 'name' => "compress_text",
        'desc' => "{buildcol.mode.compress_text}" },
      { 'name' => "build_index",
        'desc' => "{buildcol.mode.build_index}" },
      { 'name' => "infodb",
        'desc' => "{buildcol.mode.infodb}" } ];

my $sec_index_list = 
    [ {'name' => "never",
       'desc' => "{buildcol.sections_index_document_metadata.never}" },
      {'name' => "always",
       'desc' => "{buildcol.sections_index_document_metadata.always}" },
      {'name' => "unless_section_metadata_exists",
       'desc' => "{buildcol.sections_index_document_metadata.unless_section_metadata_exists}" }
      ];

my $arguments =
    [ { 'name' => "remove_empty_classifications",
	'desc' => "{buildcol.remove_empty_classifications}",
	'type' => "flag",
	'reqd' => "no",
	'modegli' => "2" },
      { 'name' => "archivedir",
	'desc' => "{buildcol.archivedir}",
	'type' => "string",
	'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "builddir",
	'desc' => "{buildcol.builddir}",
	'type' => "string",
	'reqd' => "no",
        'hiddengli' => "yes" },
#     { 'name' => "cachedir",
#	'desc' => "{buildcol.cachedir}",
#	'type' => "string",
#	'reqd' => "no" },
      { 'name' => "collectdir",
	'desc' => "{buildcol.collectdir}",
	'type' => "string",
	# parsearg left "" as default
	#'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
	'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "create_images",
	'desc' => "{buildcol.create_images}",
	'type' => "flag",
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "debug",
	'desc' => "{buildcol.debug}",
	'type' => "flag",
	'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "faillog",
	'desc' => "{buildcol.faillog}",
	'type' => "string",
	# parsearg left "" as default
	#'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "index",
	'desc' => "{buildcol.index}",
	'type' => "string",
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "incremental",
	'desc' => "{buildcol.incremental}",
	'type' => "flag",
	'hiddengli' => "yes" },
      { 'name' => "keepold",
	'desc' => "{buildcol.keepold}",
	'type' => "flag",
	'reqd' => "no",
        #'modegli' => "3",
	'hiddengli' => "yes" },
      { 'name' => "removeold",
	'desc' => "{buildcol.removeold}",
	'type' => "flag",
	'reqd' => "no",
	#'modegli' => "3",
	'hiddengli' => "yes"  },
      { 'name' => "language",
	'desc' => "{scripts.language}",
	'type' => "string",
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "maxdocs",
	'desc' => "{buildcol.maxdocs}",
	'type' => "int",
	'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "maxnumeric",
	'desc' => "{buildcol.maxnumeric}",
	'type' => "int",
	'reqd' => "no",
	'deft' => "4",
	'range' => "4,512",
	'modegli' => "3" },
      { 'name' => "mode",
	'desc' => "{buildcol.mode}",
	'type' => "enum",
	'list' => $mode_list,
	# parsearg left "" as default
#	'deft' => "all",
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "no_strip_html",
	'desc' => "{buildcol.no_strip_html}",
	'type' => "flag",
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "no_text",
	'desc' => "{buildcol.no_text}",
	'type' => "flag",
	'reqd' => "no",
	'modegli' => "3" },
      { 'name' => "sections_index_document_metadata",
	'desc' => "{buildcol.sections_index_document_metadata}",
	'type' => "enum",
	'list' => $sec_index_list,
	'reqd' => "no",
	'modegli' => "3" },
      { 'name' => "out",
	'desc' => "{buildcol.out}",
	'type' => "string",
	'deft' => "STDERR",
	'reqd' => "no",
        'hiddengli' => "yes" },
      { 'name' => "verbosity",
	'desc' => "{buildcol.verbosity}",
	'type' => "int",
	# parsearg left "" as default
	#'deft' => "2",
	'reqd' => "no",
	'modegli' => "4" },
      { 'name' => "gli",
	'desc' => "",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "xml",
	'desc' => "{scripts.xml}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" }

#      { 'name' => "incremental_dlc",
#	'desc' => "{buildcol.incremental_dlc}",
#	'type' => "flag",
#	'reqd' => "no",
#        'hiddengli' => "yes" } 
      ];

my $options = { 'name' => "buildcol.pl",
		'desc' => "{buildcol.desc}",
		'args' => $arguments };


# globals
my $collection;
my $configfilename;
my $out;

## @method gsprintf()
#  Print a string to the screen after looking it up from a locale dependant
#  strings file. This function is losely based on the idea of resource
#  bundles as used in Java.
#
#  @param  $error The STDERR stream.
#  @param  $text The string containing GS keys that should be replaced with
#                their locale dependant equivilents.
#  @param  $out The output stream.
#  @return The locale-based string to output.
#
sub gsprintf()
{
    return &gsprintf::gsprintf(@_);
}
## gsprintf() ##

&main();

## @method main()
#
#  [Parses up and validates the arguments to the build process before creating
#  the appropriate build process to do the actual work - John]
#
#  @note Added true incremental support - John Thompson, DL Consulting Ltd.
#  @note There were several bugs regarding using directories other than 
#        "import" or "archives" during import and build quashed. - John 
#        Thompson, DL Consulting Ltd.
#
#  @param  $incremental_dlc If true indicates this build should not regenerate all
#                       the index and metadata files, and should instead just
#                       append the information found in the archives directory
#                       to the existing files. If this requires some complex
#                       work so as to correctly insert into a classifier so be
#                       it. Of course none of this is done here - instead the
#                       incremental argument is passed to the document
#                       processor.
#
sub main
{
    # command line args
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs, 
	$debug, $mode, $indexname, $removeold, $keepold, $incremental, 
	$remove_empty_classifications,
	$create_images, $collectdir, $build, $type, $textindex,
	$no_strip_html, $no_text, $faillog, $gli, $index, $language,
	$sections_index_document_metadata, $maxnumeric, $incremental_dlc);

    my $xml = 0;
    $incremental_dlc = 0; # this is hidden for now as it doesn't work for standard greenstone at the moment
    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
	&PrintUsage::print_txt_usage($options, "{buildcol.params}");
	die "\n";
    }
    
    foreach my $strVariable (keys %$hashParsingResult)
    {
	eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
	&gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
	print "\n";
	return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
	&gsprintf::output_strings_in_UTF8; 
    }

    # now check that we had exactly one leftover arg, which should be 
    # the collection name. We don't want to do this earlier, cos 
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
	&PrintUsage::print_txt_usage($options, "{buildcol.params}");
	die "\n";
    }
    
    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, ">$out") ||
	    (&gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out) && die);
	$out = "buildcol::OUT";
	$close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
	&PrintUsage::print_txt_usage($options, "{buildcol.params}");
	die "\n";
    }

    if ($faillog eq "") {
	$faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added 
    # to over multiple builds (if the import process is being skipped)
    open (FAILLOG, ">>$faillog") ||
	(&gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog) && die);
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    # Don't know why this didn't already happen, but now collection specific
    # classify and plugins directory also added to include path
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify");
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins");

    # read the configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    my ($collectcfg, $buildtype); 

    if (!-e $configfilename) {
	&gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename) && die;
    }
    
    $collectcfg = &colcfg::read_collect_cfg ($configfilename);
    
    if ($verbosity !~ /\d+/) {
	if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
	    $verbosity = $collectcfg->{'verbosity'};
	} else {
	    $verbosity = 2; # the default
	}
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
	$buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
	$buildtype = "mgpp";
    } else {
	$buildtype = "mg"; #mg is the default
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
	$archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
	$cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
	$builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
	if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
	    $maxdocs = $collectcfg->{'maxdocs'};
	} else {
	    $maxdocs = -1; # the default
	}
    }
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
	$maxnumeric = $collectcfg->{'maxnumeric'};
    } 
    
    if ($maxnumeric < 4 || $maxnumeric > 512) {
	$maxnumeric = 4;
    }
    
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
	$debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
	if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
	    $mode = $collectcfg->{'mode'};
	} else {
	    $mode = "all"; # the default
	}
    }
    if (defined $collectcfg->{'index'} && $indexname eq "") {
	$indexname = $collectcfg->{'index'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
	if ($collectcfg->{'no_text'} =~ /^true$/i) {
	    $no_text = 1;
	}
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
	if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
	    $no_strip_html = 1;
	}
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
	if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
	    $remove_empty_classifications = 1;
	}
    }
    
    
    if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
	$create_images = 1;
    }
    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
	$textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
	$gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
	$sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }
    
    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
	$sections_index_document_metadata = "never";
    }
    
    ($removeold, $keepold, $incremental) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, $incremental, "building", $collectcfg);
 
    $gli = 0 unless defined $gli;

    # New argument to track whether build is incremental
    $incremental = 0 unless defined $incremental;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
	if ($textindex eq "") {
	    $textindex = "text";
	}
    }
    else {
	$textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
	my $collection_name = $collection;
	$collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'}
	if defined $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'};
	&create_images ($collection_name);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    # Modified so that the archivedir, if provided as an argument, is made
    # absolute if it isn't already
    if ($archivedir eq "")
      {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
      }
    else
      {
        $archivedir = &make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir);
      }
    # End Mod
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
	&gsprintf($out, "{buildcol.updating_archive_cache}\n")
	    if ($verbosity >= 1);

	$cachedir =~ s/[\\\/]+$//;
	$cachedir .= "/collect/$collection" unless 
	    $cachedir =~ /collect\/$collection/;

	$realarchivedir = "$cachedir/archives";
	$realbuilddir = "$cachedir/building";
	&util::mk_all_dir ($realarchivedir);
	&util::mk_all_dir ($realbuilddir);
	&util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
	$realarchivedir = $archivedir;
	$realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir,  $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
	$builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
	$buildertype = "${collection}builder";
    } else {
	$builderdir = "$ENV{'GSDLHOME'}/perllib";
	if ($buildtype eq "lucene") {
	    $buildertype = "lucenebuilder";
	}
	elsif ($buildtype eq "mgpp") {
	    $buildertype = "mgppbuilder";
	}
	else {
	    $buildertype = "mgbuilder";
	}
    }
	
    require "$builderdir/$buildertype.pm";

    eval("\$builder = new $buildertype(\$collection, " .
	 "\$realarchivedir, \$realbuilddir, \$verbosity, " .
	 "\$maxdocs, \$debug, \$keepold, \$incremental, \$incremental_dlc, " .
	 "\$remove_empty_classifications, " .
	 "\$out, \$no_text, \$faillog, \$gli)");
    die "$@" if $@;

    $builder->init();
    $builder->set_maxnumeric($maxnumeric);
    
    if (($buildertype eq "mgppbuilder") && $no_strip_html) {
	$builder->set_strip_html(0);
    }
    if ($sections_index_document_metadata ne "never") {
	$builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }
        
    if ($mode =~ /^all$/i) {
	$builder->compress_text($textindex);
	$builder->build_indexes($indexname);
	$builder->make_infodatabase();
	$builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
	$builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
	$builder->build_indexes($indexname);	
    } elsif ($mode =~ /^infodb$/i) {
	$builder->make_infodatabase();
    } else {
	(&gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode) && die);
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();
    
    if (($realbuilddir ne $builddir) && !$debug) {
	&gsprintf($out, "{buildcol.copying_back_cached_build}\n")
	    if ($verbosity >= 1);
	&util::rm_r ($builddir);
	&util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
## main() ##

## @method create_images()
#
#  [Used to create default cover images... from what I'm not quite sure - John]
#
sub create_images {
    my ($collection_name) = @_;

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon-1.2.pl");
    if (!-e $image_script) {
	&gsprintf($out, "{buildcol.no_image_script}", $image_script);
	&gsprintf($out, "{buildcol.no_default_images}\n\n");
	return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images
    system ("$image_script -size 1.5 -image_dir \"$imagedir\" -filename $collection.gif -text \"$collection_name\"");
    system ("$image_script -image_dir \"$imagedir\" -filename ${collection}sm.gif -text \"$collection_name\"");

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    if (!open (CFGFILE, $configfilename)) {
	&gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
	&gsprintf($out, "{buildcol.unlinked_col_images}\n");
	return;
    }

    my $line = ""; my $file = "";
    my $found = 0; my $foundsm = 0;
    while (defined ($line = <CFGFILE>)) {
	if ($line =~ /collectionmeta\s+iconcollection\s+/) {
	    $line = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
	    $found = 1;
	} elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
	    $line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";
	    $foundsm = 1;
	}
	$file .= $line;
    }
    close CFGFILE;

    $file .= "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n" if !$found;
    $file .= "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n" if !$foundsm;

    if (!open (CFGFILE, ">$configfilename")) {
	&gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
	&gsprintf($out, "{buildcol.unlinked_col_images}\n");
	return;
    }
    print CFGFILE $file;
    close CFGFILE;
}
## create_images() ##

## @method make_absolute()
#
#  Ensure the given file path is absolute in respect to the given base path.
#
#  @param  $base_dir A string denoting the base path the given dir must be
#                    absolute to.
#  @param  $dir The directory to be made absolute as a string. Note that the
#               dir may already be absolute, in which case it will remain
#               unchanged.
#  @return The now absolute form of the directory as a string.
#
#  @author John Thompson, DL Consulting Ltd.
#  @copy 2006 DL Consulting Ltd.
#
sub make_absolute()
  {
    my ($base_dir, $dir) = @_;
    $dir = $base_dir . "/$dir" if ($dir =~ m#^[^/]#);
    $dir =~ s|^/tmp_mnt||;
    1 while($dir =~ s|/[^/]*/\.\./|/|g);
    $dir =~ s|/[.][.]?/|/|g;
    $dir =~ tr|/|/|s;
    return $dir;
  }
## make_absolute() ##
