###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl)
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat 
#
# It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned
# into a package for use in Greenstone digital library system.  Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.  
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package textcat;

# OPTIONS
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;                # Ngrams which occur <= this number of times are removed
my $opt_t = 400;              # topmost number of ngrams that should be used
my $opt_u = 1.05;             # how much worse result must be before it is ignored

my $non_word_characters = '0-9\s';

sub new {
    my $class = shift (@_);
    my $self = {};

    # open directory to find which languages are supported
    opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
    closedir DIR;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
	my %ngram=();
	my $rang=1;
	open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
	while (<LM>) {
	    chomp;
	    # only use lines starting with appropriate character. Others are ignored.
	    if (/^[^$non_word_characters]+/o) {
		$self->{'ngrams'}->{$language}->{$&} = $rang++;
	    } 
	}
	close(LM);
    }

    $self->{'languages'} = \@languages;
    return bless $self, $class;
}



# CLASSIFICATION
#
# What language is a text string?
#   Input:  text string
#   Output: array of language names

sub classify {
    my ($self, $inputref)=@_;
    my %results = ();
    my $maxp = $opt_t;

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {
    
	# compare language model with input ngrams list
	my ($i,$p)=(0,0);
	while ($i < scalar (@$unknown)) {
	    if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
		$p=$p+abs($self->{'ngrams'}->{$language}->{$unknown->[$i]}-$i);
	    } else { 
		$p=$p+$maxp; 
	    }
	    ++$i;
	}
	$results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    my $a = $results{$results[0]};
  
    my @answers=(shift(@results));
    while (@results && $results{$results[0]} < ($opt_u *$a)) {
	@answers=(@answers,shift(@results));
    }

    return \@answers;
}

sub create_lm {
    # $ngram contains reference to the hash we build
    # then add the ngrams found in each word in the hash
    my ($self, $textref) = @_;  
    
    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen=$len;
	my $i;

	for ($i=0; $i<$flen; $i++) {
	    $ngram->{substr($word,$i,5)}++ if $len > 4;
	    $ngram->{substr($word,$i,4)}++ if $len > 3;
	    $ngram->{substr($word,$i,3)}++ if $len > 2;
	    $ngram->{substr($word,$i,2)}++ if $len > 1;
	    $ngram->{substr($word,$i,1)}++;
	    $len--;
	}
    }

    map { if ($ngram->{$_} <= $opt_f) { delete $ngram->{$_}; }
      } keys %$ngram;
  
    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding  `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted,$opt_t) if (@sorted > $opt_t); 
    return \@sorted;
}

1;
