#!/usr/bin/perl

use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
use DBI;
use charnames ':full';

my $revision='$Revision: 838 $';
my $verbose = 0;
my ($help, $slang, $dlang);

GetOptions(
  "verbose+" => \$verbose,
  "help|?" => \$help,
  "source-language=s" => \$slang,
  "destination-language=s" => \$dlang
  ) or exit 1;

pod2usage -exitval => 1, -verbose => 2 if $help;

print STDERR "Revision: $revision\n" if $verbose;


sub in_array
{
  my($arrayref, $what) = @_;
  local $_;
  for (@$arrayref)
  { return 1 if $_ eq $what }
  return 0
}

sub xmlEscape
{
  my $s = shift;
  $s =~ s/&/&amp;/g;
  $s =~ s/\"/&quot;/g;
  $s =~ s/\'/&apos;/g;
  $s =~ s/</&lt;/g;
  $s
}

# For the hash values see also The FreeDict HOWTO, Table 5.1. Part of Speech
# Typology.  The Ergane typology is more fine grained.
my %t2p =
(
  0 => 'n', # Nomen
  1 => 'adj', # Adjective
  2 => 'adv', # Adverb
  3 => 'v', # Verb
  4 => 'num', # Numeral
  5 => 'prep', # Preposition
  6 => 'int', # Interjection
  7 => 'pron', # Relative Pronoun
  8 => 'conj', # Conjunction
  10 => 'pron', # Pronoun
  11 => 'art', # Article
  12 => 'pron', # Personal Pronoun
  13 => 'pron', # Demonstrative Pronoun
  14 => 'pron', # Indefinite Pronoun
  15 => 'pron', # Reflexive Pronoun
  16 => 'pron', # Possesive Pronoun
  17 => 'num', # Ordinal Number
  20 => 'pron', # Interrogative Pronoun
  23 => 'n', # Nomen Proprium
);

sub posType2Pos
{
  my $Type = shift;
  return $t2p{$Type} if exists $t2p{$Type};
  return undef
}

sub genType2Gen
{
  my $Type = lc shift;# assume case doesn't matter (dut.mdb contains M F N)
  return $Type if $Type eq 'm' or $Type eq 'f' or $Type eq 'n';
  return undef
}

sub to_unicode
{
  my($Schrift, $word, $where) = @_;
  my $result = '';
  for (split //, $word)
  {
    unless(exists $Schrift->{$_})
    {
      for (sort keys %$Schrift)
      {
	print STDERR "$_\t$Schrift->{$_}\n" if $verbose>1
      }
      print STDERR "Headword '" .
	(defined($where) ? "$where': Translation '" : '') .
	"$word': Undefined char mapping for " .
	"char '$_' (" . ord($_) . ") - using identity\n" if $verbose;
	    $Schrift->{$_} = $_;
    }
    $result .= $Schrift->{$_}
  }
  return $result
}

my (%POSTypesCount, %genTypesCount, $sSchrift, $dSchrift);

sub formatEntry
{
  my($XEntry, $POSType, $genType, @senses) = @_;
  my $orth = xmlEscape to_unicode $sSchrift, $XEntry;
  $POSTypesCount{$POSType}++;
  $genTypesCount{$genType}++;
  my $pos = posType2Pos $POSType;
  my $gen = genType2Gen $genType;
  print STDERR "Line $.: Unknown Part-of-Speech type $POSType ('$XEntry')\n"
    unless defined $pos || $POSTypesCount{$POSType}>1;
  print STDERR "Line $.: Unknown Genus type $genType ('$XEntry')\n"
    unless defined $gen || $genTypesCount{$genType}>1;
  my $gramGrp = '';
  if (defined($pos) or defined($gen))
  {
    $gramGrp = "\n  <gramGrp>";
    $gramGrp .= "<pos>$pos</pos>" if defined $pos;
    $gramGrp .= "<gen>$gen</gen>" if defined $gen;
    $gramGrp .= "</gramGrp>"
  }
  my $senses = '';
  for (@senses)
  {
    my @transs = ();
    for my $trans1 (@{$$_{'transs'}})
    {
      $trans1 =~ s/\s+$//;
      my $trans = "    <trans><tr>" .
        xmlEscape(to_unicode $dSchrift, $trans1, $orth) .
        "</tr></trans>\n";
      push @transs, $trans unless in_array \@transs, $trans
    }
    $senses .= "  <sense>\n" . join('', @transs) ."  </sense>\n"
  }
  print <<"END";
<entry>
  <form><orth>$orth</orth></form>$gramGrp
$senses</entry>
END
}

die "source and destination language have to be given"
  unless defined($slang) && defined($dlang);

# get KB_Map for $slang and $dlang
my $dbh = DBI->connect("dbi:Pg:dbname=$ENV{USER}",
  undef, undef, { RaiseError => 1 });

sub get_schrift
{
  my $lang = shift;
  # exceptions...
  $lang = 'ces' if $lang eq 'cze';
  $lang = 'dut' if $lang eq 'nld';
  $lang = 'gae' if $lang eq 'sco';
  $lang = 'sve' if $lang eq 'swe';
  my ($Schrift1) = $dbh->selectrow_array(
    "SELECT Schrift FROM Talen WHERE TaalCode='$lang'");
  unless(defined $Schrift1)
  {
    print STDERR "$lang Schrift index not in 'Talen' table.  Setting it to 0.\n";
    $Schrift1 = 0
  }
  print STDERR "$lang Schrift index: $Schrift1\n" if $verbose;
  my $Schrift_numeric = $dbh->selectall_hashref(
    "SELECT keyascii, UnicodeVal FROM KB_Map " .
    "WHERE Schrift='$Schrift1'", 'keyascii');
  my $Schrift;
  # transform numeric indices of KB_Map into characters
  for (keys %$Schrift_numeric)
  {
    my $v = $Schrift_numeric->{$_};
    $Schrift->{chr $_ } = chr $v->{'unicodeval'}
  }
  if($Schrift1 == 2) # cyrillic
  {
    $Schrift->{'<'} = '';
    $Schrift->{'>'} = '';
    $Schrift->{'$'} = "\N{CYRILLIC SMALL LETTER SHCHA}";
    $Schrift->{'j'} = "\N{CYRILLIC SMALL LETTER SHORT I}";
    $Schrift->{'J'} = "\N{CYRILLIC CAPITAL LETTER SHORT I}";
    $Schrift->{chr 96} = "\N{CYRILLIC SMALL LETTER HARD SIGN}";
    $Schrift->{chr 192} = "\N{CYRILLIC CAPITAL LETTER YA}";
    $Schrift->{chr 200} = "\N{CYRILLIC CAPITAL LETTER E}";
    $Schrift->{chr 217} = "\N{CYRILLIC CAPITAL LETTER YU}";
    $Schrift->{chr 224} = "\N{CYRILLIC SMALL LETTER YA}";
    $Schrift->{chr 232} = "\N{CYRILLIC CAPITAL LETTER E}";
    $Schrift->{chr 235} = "\N{CYRILLIC SMALL LETTER IO}";
    $Schrift->{chr 249} = "\N{CYRILLIC SMALL LETTER YU}";
  }
  return $Schrift
}

$sSchrift = get_schrift $slang;
$dSchrift = get_schrift $dlang;
$dbh->disconnect;

binmode STDIN, ':utf8';
binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';
my($XEntry, $POSType, $genType, %currentSense, @senses);
while(<>)
{
  # get all the columns
  # "SELECT *" gave us too much
  # we only use $wEspKey
  my($wKeyNo, $wEspKey, $wXEntry, $wType, $wGType, $wFType, $wOmschr, $wFreq,
    $wVolgorde, $wOpm, $wOpm2, $wSortKey, $wUitspraak, $wlanguage, $xKeyNo,
    $xEspKey, $xXEntry, $xType, $xGType, $xFType, $xOmschr, $xFreq, $xVolgorde,
    $xOpm, $xOpm2, $xSortKey, $xUitspraak, $xlanguage) = split /\t/;
  unless(defined $wXEntry && defined $xXEntry)
  {
    print STDERR "Line $.: Empty headword or trans. Skipping: '",
      quotemeta $_, "'\n";
    next
  }
  $wXEntry =~ s/\s+$//;# mdb-sql exports with lots of trailing space
  $xXEntry =~ s/\s+$//;
  $wGType =~ s/\s+$//;
  #print STDERR "Line $.: wXEntry='$wXEntry' xXEntry='$xXEntry'\n";
  # collect <orth>, <pron>, <tr>s by <trans> and <sense>s
  if(not(defined $XEntry) || $wXEntry ne $XEntry
			   || $wType ne $POSType
			   || $wGType ne $genType)
  {
    if(defined $XEntry)
    {
      push @senses, { %currentSense };
      # output an <entry>
      formatEntry $XEntry, $POSType, $genType, @senses
    }
    # collect line for a new <entry>
    $XEntry = $wXEntry;
    $POSType = $wType;
    $genType = $wGType;
    @senses = ();
    $currentSense{'EspKey'} = $wEspKey;
    $currentSense{'transs'} = [ $xXEntry ]
  }
  else
  {
    if($currentSense{'EspKey'} == $wEspKey)
    {
      # add <trans> to current <sense>
      push @{ $currentSense{'transs'} }, $xXEntry
    }
    else
    {
      # start a new <sense>
      push @senses, { %currentSense };
      $currentSense{'EspKey'} = $wEspKey;
      $currentSense{'transs'} = [ $xXEntry ]
    }
  }
}

# output last <entry>
push @senses, { %currentSense };
formatEntry $XEntry, $POSType, $genType, @senses;

if($verbose)
{
  # Output $POSType and $genType statistics
  print STDERR "POSType\tCount\tName\n----------------------\n";
  for (sort keys %POSTypesCount)
  { print STDERR "$_\t", $POSTypesCount{$_}, "\t", $t2p{$_} || 'unknown', "\n" }

  print STDERR "genType\tCount\n------------------\n";
  for (sort keys %genTypesCount)
  { print STDERR "$_\t", $genTypesCount{$_}, "\n" }
}

__END__

=head1 NAME

ergane-records2tei - Convert output of a certain psql query into TEI XML

=head1 SYNOPSIS

ergane-records2tei [options] --source-language B<LA1> \
  --destination-language B<LA2> < `psql QUERY` > la1-la2-nophon.tei

=head1 DESCRIPTION

This program converts the result of a Postgresql query of a 'Woordenboek' table
extended by a 'language' column from STDIN (or @ARGV) into TEI format on
STDOUT.

This only emits <entry> elements.  The TEI header and footer have to be
provided extra.

=head1 OPTIONS

=over 8

=item B<--help>

Print this manual page and exit.

=item B<--source-language I<LA1>>

3-Letter-Code of the source language.  Mandatory argument.  The source language
is the language of the headwords of the dictionary.

=item B<--destination-language I<LA2>>

3-Letter-Code of the detination.  Mandatory argument.  The destination language
is the language of the translations of the dictionary.


=item B<--verbose>

Tell what I'm doing and print warnings.  Can be given twice for more verbosity.
Messages are printed to STDERR.

=back

=head1 HISTORY

This script differs from its predecessor F<ergane2tei.pl> by Horst Eyermann in
that it understands a different input.  It seems F<ergane2tei.pl>'s input had
been generated by Ergane 5.0's export function which is not included in the
current version 8.0 from L<http://download.travlang.com> (at least I have not
found it).  The website of version 5 is still available at
L<http://download.travlang.com/Ergane/download-en.html>, but the files not.

XXX what about web history site

=head1 AUTHOR AND LICENCE

Author: Michael Bunk, 2006-2007

This is free software, licensed under the GPL.

=cut

