###########################################################################
#
# RecPlug.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# RecPlug is a plugin which recurses through directories processing
# each file it finds.

# RecPlug has one option: use_metadata_files.  When this is set, it will
# check each directory for an XML file called "metadata.xml" that specifies
# metadata for the files (and subdirectories) in the directory.
#
# Here's an example of a metadata file that uses three FileSet structures
# (ignore the # characters):

#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
#<DirectoryMetadata>
#  <FileSet>
#    <FileName>nugget.*</FileName>
#    <Description>
#      <Metadata name="Title">Nugget Point, The Catlins</Metadata>
#      <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
#    </Description>
#  </FileSet>
#  <FileSet>
#    <FileName>nugget-point-1.jpg</FileName>
#    <Description>
#      <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
#      <Metadata name="Subject">Lighthouse</Metadata>
#    </Description>
#  </FileSet>
#  <FileSet>
#    <FileName>kaka-point-dir</FileName>
#    <Description>
#      <Metadata name="Title">Kaka Point, The Catlins</Metadata>
#    </Description>
#  </FileSet>
#</DirectoryMetadata>

# Metadata elements are read and applied to files in the order they appear
# in the file.
#
# The FileName element describes the subfiles in the directory that the
# metadata applies to as a perl regular expression (a FileSet group may
# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
# indicates that the metadata records in the following Description block
# apply to every subfile that starts with "nugget".  For these files, a
# Title metadata element is set, overriding any old value that the Title
# might have had.
#
# Occasionally, we want to have multiple metadata values applied to a
# document; in this case we use the "mode=accumulate" attribute of the
# particular Metadata element.  In the second metadata element of the first
# FileSet above, the "Place" metadata is accumulating, and may therefore be
# given several values.  If we wanted to override these values and use a
# single metadata element again, we could set the mode attribute to
# "override" instead.  Remember: every element is assumed to be in override
# mode unless you specify otherwise, so if you want to accumulate metadata
# for some field, every occurance must have "mode=accumulate" specified.
#
# The second FileSet element above applies to a specific file, called
# nugget-point-1.jpg.  This element overrides the Title metadata set in the
# first FileSet, and adds a "Subject" metadata field.
#
# The third and final FileSet sets metadata for a subdirectory rather than
# a file.  The metadata specified (a Title) will be passed into the
# subdirectory and applied to every file that occurs in the subdirectory
# (and to every subsubdirectory and its contents, and so on) unless the
# metadata is explictly overridden later in the import.



package RecPlug;

use BasPlug;
use plugin;
use util;
use metadatautil;

use File::Basename;
use strict;
no strict 'refs';

BEGIN {
    @RecPlug::ISA = ('BasPlug');
}

my $arguments =
    [ { 'name' => "block_exp",
	'desc' => "{BasPlug.block_exp}",
	'type' => "regexp",
	'deft' => &get_default_block_exp(),
	'reqd' => "no" },
      # this option has been deprecated. leave it here for now so we can warn people not to use it
      { 'name' => "use_metadata_files",
	'desc' => "{RecPlug.use_metadata_files}",
	'type' => "flag",
	'reqd' => "no",
	'hiddengli' => "yes" },
      { 'name' => "recheck_directories",
	'desc' => "{RecPlug.recheck_directories}",
	'type' => "flag",
	'reqd' => "no" } ];
    
my $options = { 'name'     => "RecPlug",
		'desc'     => "{RecPlug.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args'     => $arguments };

sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};

    my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
    
    if ($self->{'info_only'}) {
	# don't worry about any options or initialisations etc
	return bless $self, $class;
    }

    # we have left this option in so we can warn people who are still using it
    if ($self->{'use_metadata_files'}) {
	die "ERROR: RecPlug -use_metadata_files option has been deprecated. Please remove the option and add MetadataXMLPlug to your plugin list instead!\n";
    }
	    
    $self->{'subdir_extrametakeys'} = {};

    return bless $self, $class;
}

sub begin {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    my $proc_package_name = ref $processor;

    if ($proc_package_name !~ /buildproc$/ && $self->{'incremental'} == 1) {

	# Only lookup timestamp info for import.pl, and only if incremental is set

	my $output_dir = $processor->getoutputdir();
	my $archives_inf = &util::filename_cat($output_dir,"archives.inf");
    
	if ( -e $archives_inf ) {
	    $self->{'inf_timestamp'} = -M $archives_inf;
	}
    }

    $self->SUPER::begin($pluginfo, $base_dir, $processor, $maxdocs);
}


# return 1 if this class might recurse using $pluginfo
sub is_recursive {
    my $self = shift (@_);
    
    return 1;
}

sub get_default_block_exp {
    my $self = shift (@_);
    
    return 'CVS';
}

# return number of files processed, undef if can't process
# Note that $base_dir might be "" and that $file might 
# include directories

# This function passes around metadata hash structures.  Metadata hash
# structures are hashes that map from a (scalar) key (the metadata element
# name) to either a scalar metadata value or a reference to an array of
# such values.

sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $in_metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $read_metadata_files = $self->{'use_metadata_files'};

    # Calculate the directory name and ensure it is a directory and
    # that it is not explicitly blocked.
    my $dirname = $file;
    $dirname = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    return undef unless (-d $dirname);
    return 0 if ($self->{'block_exp'} ne "" && $dirname =~ /$self->{'block_exp'}/);

    # check to make sure we're not reading the archives or index directory
    my $gsdlhome = quotemeta($ENV{'GSDLHOME'});
    if ($dirname =~ m/^$gsdlhome\/.*?\/import.*?\/(archives|index)$/) {
	print $outhandle "RecPlug: $dirname appears to be a reference to a Greenstone collection, skipping.\n";
        return 0;
    }
    
    # check to see we haven't got a cyclic path...
    if ($dirname =~ m%(/.*){,41}%) {
	print $outhandle "RecPlug: $dirname is 40 directories deep, is this a recursive path? if not increase constant in RecPlug.pm.\n";
	return 0;
    }
    
    # check to see we haven't got a cyclic path...
    if ($dirname =~ m%.*?import/(.+?)/import/\1.*%) {
	print $outhandle "RecPlug: $dirname appears to be in a recursive loop...\n";
	return 0;
    }
    
    if (($verbosity > 2) && ((scalar keys %$in_metadata) > 0)) {
        print $outhandle "RecPlug: metadata passed in: ", 
	join(", ", keys %$in_metadata), "\n";
    }
    
    # Recur over directory contents.
    my (@dir, $subfile);
    my $count = 0;
    
    print $outhandle "RecPlug: getting directory $dirname\n" if ($verbosity);
    
    # find all the files in the directory
    if (!opendir (DIR, $dirname)) {
	if ($gli) {
	    print STDERR "<ProcessingError n='$file' r='Could not read directory $dirname'>\n";
	}
	print $outhandle "RecPlug: WARNING - couldn't read directory $dirname\n";
	return -1; # error in processing
    }
    @dir = readdir (DIR);
    closedir (DIR);

    # Re-order the files in the list so any directories ending with .all are moved to the end
    for (my $i = scalar(@dir) - 1; $i >= 0; $i--) {
	if (-d &util::filename_cat($dirname, $dir[$i]) && $dir[$i] =~ /\.all$/) {
	    push(@dir, splice(@dir, $i, 1));
	}
    }

    # setup the metadata structures. we do a metadata_read pass to see if there is any additional metadata, then pass it to read
    
    my $additionalmetadata = 0;      # is there extra metadata available?
    my %extrametadata;               # maps from filespec to extra metadata keys
    my @extrametakeys;               # keys of %extrametadata in order read

    my $os_dirsep = &util::get_os_dirsep();
    my $dirsep    = &util::get_dirsep();
    my $base_dir_regexp = $base_dir;
    $base_dir_regexp =~ s/\//$os_dirsep/g;
    my $local_dirname = $dirname;
    $local_dirname =~ s/^$base_dir_regexp($os_dirsep)//;
    $local_dirname .= $dirsep;

    if (defined $self->{'subdir_extrametakeys'}->{$local_dirname}) {
	my $extrakeys = $self->{'subdir_extrametakeys'}->{$local_dirname};
	foreach my $ek (@$extrakeys) {
	    my $extrakeys_re  = $ek->{'re'};
	    my $extrakeys_md  = $ek->{'md'};
	    push(@extrametakeys,$extrakeys_re);
	    $extrametadata{$extrakeys_re} = $extrakeys_md;
	}
	delete($self->{'subdir_extrametakeys'}->{$local_dirname});
    }
    
    # apply metadata pass for each of the files in the directory
    my $out_metadata; 
    my $num_files = scalar(@dir);
    for (my $i = 0; $i < scalar(@dir); $i++) {
	my $subfile = $dir[$i];
	my $this_file_base_dir = $base_dir;
	last if ($maxdocs != -1 && $count >= $maxdocs);
	next if ($subfile =~ m/^\.\.?$/);

	# Recursively read each $subfile
	print $outhandle "RecPlug metadata recurring: $subfile\n" if ($verbosity > 2);
	
	$count += &plugin::metadata_read ($pluginfo, $this_file_base_dir,
					  &util::filename_cat($file, $subfile),
					  $out_metadata, \@extrametakeys, \%extrametadata,
					  $processor, $maxdocs, $gli);
	$additionalmetadata = 1;
    }
   
    # filter out any extrametakeys that mention subdirectories and store
    # for later use (i.e. when that sub-directory is being processed)

    foreach my $ek (@extrametakeys) {
	my ($subdir_re,$extrakey_dir) = &File::Basename::fileparse($ek);
	$extrakey_dir =~ s/\\\./\./g; # remove RE syntax

	my $dirsep_re = &util::get_re_dirsep();
	if ($ek =~ m/$dirsep_re/) { # specifies at least one directory
	    my $md = $extrametadata{$ek};

	    my $subdir_extrametakeys = $self->{'subdir_extrametakeys'};

	    my $subdir_rec = { 're' => $subdir_re, 'md' => $md };
	    push(@{$subdir_extrametakeys->{$extrakey_dir}},$subdir_rec);
	}
    }
    
    # import each of the files in the directory
    $count=0;
    for (my $i = 0; $i <= scalar(@dir); $i++) {
	# When every file in the directory has been done, pause for a moment (figuratively!)
	# If the -recheck_directories argument hasn't been provided, stop now (default)
	# Otherwise, re-read the contents of the directory to check for new files
	#   Any new files are added to the @dir list and are processed as normal
	#   This is necessary when documents to be indexed are specified in bibliographic DBs
	#   These files are copied/downloaded and stored in a new folder at import time
	if ($i == $num_files) {
	    last unless $self->{'recheck_directories'};

	    # Re-read the files in the directory to see if there are any new files
	    last if (!opendir (DIR, $dirname));
	    my @dirnow = readdir (DIR);
	    closedir (DIR);

	    # We're only interested if there are more files than there were before
	    last if (scalar(@dirnow) <= scalar(@dir));

	    # Any new files are added to the end of @dir to get processed by the loop
	    my $j;
	    foreach my $subfilenow (@dirnow) {
		for ($j = 0; $j < $num_files; $j++) {
		    last if ($subfilenow eq $dir[$j]);
		}
		if ($j == $num_files) {
		    # New file
		    push(@dir, $subfilenow);
		}
	    }
	    # When the new files have been processed, check again
	    $num_files = scalar(@dir);
	}

	my $subfile = $dir[$i];
	my $this_file_base_dir = $base_dir;
	last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
	next if ($subfile =~ /^\.\.?$/);

	# Follow Windows shortcuts
	if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
	    require Win32::Shortcut;
	    my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
	    if ($shortcut) {
		# The file to be processed is now the target of the shortcut
		$this_file_base_dir = "";
		$file = "";
		$subfile = $shortcut->Path;
	    }
	}

	# check for a symlink pointing back to a leading directory
	if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
	    # readlink gives a "fatal error" on systems that don't implement
	    # symlinks. This assumes the the -l test above would fail on those.
	    my $linkdest=readlink "$dirname/$subfile";
	    if (!defined ($linkdest)) {
		# system error - file not found?
		warn "RecPlug: symlink problem - $!";
	    } else {
		# see if link points to current or a parent directory
		if ($linkdest =~ m@^[\./\\]+$@ ||
		    index($dirname, $linkdest) != -1) {
		    warn "RecPlug: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
		    next;
		    ;
		}
	    }
	}

	print $outhandle "RecPlug: preparing metadata for $subfile\n" if ($verbosity > 2);

	# Make a copy of $in_metadata to pass to $subfile
	$out_metadata = {};
	&metadatautil::combine_metadata_structures($out_metadata, $in_metadata);

	# Next add metadata read in XML files (if it is supplied)
	if ($additionalmetadata == 1) {
	    
	    my ($filespec, $mdref);
	    foreach $filespec (@extrametakeys) {
		if ($subfile =~ /^$filespec$/) {
		    print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n" 
			if ($verbosity > 2);
		    $mdref = $extrametadata{$filespec};
		    &metadatautil::combine_metadata_structures($out_metadata, $mdref);
		}
	    }
	}


	my $file_subfile = &util::filename_cat($file, $subfile);
	my $filename_subfile 
	    = &util::filename_cat($this_file_base_dir,$file_subfile);
	if (defined $self->{'inf_timestamp'}) {
	    my $inf_timestamp = $self->{'inf_timestamp'};

	    if (! -d $filename_subfile) {
		my $filename_timestamp = -M $filename_subfile;
		if ($filename_timestamp > $inf_timestamp) {
		    # filename has been around for longer than inf
#####		    print $outhandle "**** Skipping $subfile\n";
		    next;
		}
	    }
	}

	# Recursively read each $subfile
	print $outhandle "RecPlug recurring: $subfile\n" if ($verbosity > 2);
	
	$count += &plugin::read ($pluginfo, $this_file_base_dir,
				 $file_subfile,
				 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
    }

    return $count;
}

1;
