caighdean.pl

#!/usr/bin/perl

use strict;
use warnings;
use utf8;
use Memoize;
use Storable;
use Redis;
use Encode qw(encode);

binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Command-line switches; arguments that are not listed here are ignored.
#   -v  verbose tracing on STDOUT
#   -u  only print tokens for which no standardization was found
#   -c  only print the candidate standardizations of each token
#   -t  run the built-in unit tests instead of translating STDIN
#   -d  use the data files with the '-gd' suffix
#   -x  use the data files with the '-gv' suffix
my ($verbose, $unknowns, $candidates, $runtests) = (0, 0, 0, 0);
my $extension = '';

my %switch_action = (
	'-v' => sub { $verbose = 1; },
	'-u' => sub { $unknowns = 1; },
	'-c' => sub { $candidates = 1; },
	'-t' => sub { $runtests = 1; },
	'-d' => sub { $extension = '-gd'; },
	'-x' => sub { $extension = '-gv'; },
);

for my $arg (@ARGV) {
	my $action = $switch_action{$arg};
	$action->() if (defined($action));
}

# all_matches() gives up once its running rule count exceeds this value
my $maxdepth = 10;
# amount subtracted from a hypothesis' logprob per rule applied to a candidate
my $penalty = 2.9;
# running totals reported in verbose mode: tokens seen / tokens with no match
my $tokens = 0;
my $unknown = 0;

# spelling rules read from rules*.txt; each is a hashref with the keys
# 'patt' (compiled regex), 'repl' (quoted replacement) and 'level'
my @rules;
# "source target" pairs read from spurious*.txt that all_matches must not emit
my %spurious;
# hashref loaded from cands*.hash: token => arrayref of standardizations
my $candsref;
# Redis client handle, created in load_databases()
my $redis;

# Keys are strings containing last two processed words, whether
# flushed or not.  If we haven't flushed in a while, the key is usually
# simply the last two words in the hypothesis.
# We just need the last two since these are used to compute the
# most likely *next* word, which only depends on the previous two.
# The value corresponding to the two words is a hashref representing
# the *best* hypothesis with the given two final words.
# The hashref stores the running logprob of the hypothesis
# and an array containing all of the standardizations in the hypothesis...
# this could conceivably be quite long.
# entries in the array are hashrefs that look like:
# {'s' => 'bainríoghan', 't' => 'banríon'}
my %hypotheses;
# start with a single empty hypothesis keyed by the empty context
$hypotheses{''} = {
	'logprob' => 0.0,
	'output' => [],
}; 

# Return the numerically larger of the two arguments.
sub max {
	my ($x, $y) = @_;
	return ($x > $y) ? $x : $y;
}

# Append word $w to the space-separated string $s; an empty $s yields
# just $w (no leading space).
sub extend_sentence {
	my ($s, $w) = @_;
	return ($s eq '') ? $w : join(' ', $s, $w);
}

# Return the final two space-separated words of $s, or $s itself when
# it contains no space at all.
sub last_two_words {
	my ($s) = @_;
	return $s unless ($s =~ m/ /);
	my ($pair) = ($s =~ m/([^ ]+ [^ ]+)$/);
	return $pair;
}

# Only used in verbose mode: the target-side words ('t' fields) of a
# hypothesis' output list, joined with single spaces.
sub hypothesis_output_string {
	my ($hyp) = @_;
	return join(' ', map { $_->{'t'} } @{$hyp->{'output'}});
}

# Every line of output goes through this function, which defines the
# standard "pairs" format:
# Tha => Tá
# mi => mé
# ...
# When source and target differ, the target is re-cased to follow the
# capitalization style of the source, and the underscores of a
# multiword source are turned back into spaces.
sub prep_for_output {
	my ($s, $t) = @_;
	if ($s ne $t) {
		# multiword source or multiword target: leave target case alone
		my $multiword = ($s =~ m/_/) || ($t =~ m/ /);
		$t = irishlc($t) if (!$multiword);
		$t = recapitalize($t, cap_style($s));
		if ($t !~ m/_/) {
			$s =~ s/_\././g;   # m_._e_. => m._e.
			$s =~ s/(.)_/$1 /g;
		}
	}
	return "$s => $t";
}

# Argument is a hypothesis (a hashref with 'logprob' and 'output' keys).
# Returns one "source => target" line, newline-terminated, per entry of
# its output list, in order.
sub hypothesis_pairs_string {
	my ($hyp) = @_;
	my @lines = map { prep_for_output($_->{'s'}, $_->{'t'})."\n" } @{$hyp->{'output'}};
	return join('', @lines);
}

# N=3 is hard-coded here.  $ngram arrives as a 1-gram or a 2-gram; $w is
# appended, and if $ngram already had two words its first word is
# dropped so the result is again at most two words.
sub shift_ngram {
	my ($ngram, $w) = @_;
	my $shifted = "$ngram $w";
	return $shifted if ($ngram !~ m/ /);
	$shifted =~ s/^[^ ]+ //;
	return $shifted;
}

# Convert to title case.  A whole phrase may be passed, in which case
# only its first word is affected; alternatively each word of a phrase
# is passed separately (see $camelcase in recapitalize_one).
# The second argument is true iff $w is the full token; when it is
# false the function words "an", "i" and "na" are returned untouched.
sub irishtc {
	my ($w, $start_p) = @_;
	if (!$start_p) {
		return $w if ($w =~ m/^(an|i|na)$/);
	}
	$w =~ s/^mc(.)/"Mc".uc($1)/e;
	$w =~ s/^o'(.)/"O'".uc($1)/e;
	# eclipsed initials: the capital goes on the eclipsed consonant
	for my $eclipsis (['mb','mB'], ['gc','gC'], ['nd','nD'], ['bhf','bhF'],
			['ng','nG'], ['bp','bP'], ['ts','tS'], ['dt','dT']) {
		my ($lower, $fixed) = @{$eclipsis};
		$w =~ s/^$lower/$fixed/;
	}
	if ($w =~ m/^h([aeiouáéíóú].*)$/) {
		my $tail = $1;
		# treat h as a prefix only if the rest is a known word, and the
		# whole word is not one of the listed genuine h-words
		my $tail_known = (exists($candsref->{$tail}) or exists($candsref->{ucfirst($tail)}));
		if ($tail_known and $w !~ m/^h(aigh|allaí?|aló|ata?í?|údaí|ur|.)$/) {
			$w =~ s/^h(.)/'h'.uc($1)/e;  # haimsire -> hAimsire
		}
		else {
			$w =~ s/^h/H/;  # halla -> Halla
		}
	}
	$w =~ s/^([nt])-([aeiouáéíóú])/$1.uc($2)/e;
	if ($w !~ /^[^ ]*\p{Lu}/) {  # first word still has no capital
		$w =~ s/^(['-]*)(.)/$1.uc($2)/e;
	}
	return $w;
}

# Apply capitalization style $n (a bit mask as produced by cap_style)
# to the lowercase word or phrase $w and return the result.
sub recapitalize_one {
	my ($w, $n) = @_;
	# unpack the four style bits, lowest first
	my $capital_p = $n & 1;
	my $firstcap_p = ($n >> 1) & 1;
	my $camelcase = ($n >> 2) & 1;
	my $allcaps = ($n >> 3) & 1;

	if ($capital_p and $firstcap_p) {
		$w = irishtc($w,1);
	}
	elsif ($capital_p) {
		$w =~ s/^([bdm]')([aeiouáéíóú])/$1.uc($2)/e;  # d'Éirinn
		$w =~ s/^(h-?)([aeiouáéíóú])/$1.uc($2)/e;  # hÉireann
		# fall back to title case if the first word still has no capital
		$w = irishtc($w,1) if ($w !~ /^[^ ]*\p{Lu}/);
	}

	if ($camelcase) {
		# capitalize the later pieces: words of a phrase if there are
		# spaces, otherwise the parts after each hyphen
		if ($w =~ m/ /) {
			$w =~ s/ ([^ ]+)/" ".irishtc($1,0)/eg;
		}
		else {
			$w =~ s/-([^-]+)/"-".irishtc($1,0)/eg;
		}
	}

	if ($allcaps) {
		if ($w =~ m/\p{Ll}.*\p{Lu}/) {
			# keep the lowercase mutation prefix, uppercase the rest
			$w =~ s/^((?:\p{Ll}|['-])*\p{Lu})(.*)$/$1.uc($2)/e;
		}
		else {
			$w = uc($w);
		}
	}
	return $w;
}

# Re-case $w according to style $n.  Styles below 8 are applied to the
# string as a whole; with the all-caps bit set (8 and up) every
# space-separated word is re-cased on its own.
sub recapitalize {
	my ($w, $n) = @_;
	return recapitalize_one($w, $n) if ($n < 8);
	$w =~ s/([^ ]+)/recapitalize_one($1, $n)/eg;
	return $w;
}

# 1st bit: on if "first" letter capitalized (ignoring eclipsis, etc.)
# 2nd bit: on if the actual first letter is capitalized (Tacht but not tAcht)
# 3rd bit: on if camel case; == cap after hyphen (except h-,n-,t-) *or* space
# (only examples where it's mixed are like "Bhaile-an-Easa" - rare)
# 4th bit: on if all caps (at least 2) after initial eclipsis or whatever.  So:
# 0 = fear, bean, droch-cheann; ~90% of single word tokens in gd/gv/pre-std ga
# 1 = bhFear, h-Árd-rí, 'Sé
# 3 = Droch-chor, Fear, Bean
# 4 = sean-Mháirtín
# 5 = bhFíor-Ghaedhealtacht
# 7 = Nua-Eabhrac, Ard-Easbog
# 9 = gCNOC, h-AIMSIRE
# 11 = FEAR 
# 13 = mBÉAL-OIDEAS
# 15 = SEAN-GHAEDHEAL
# It's important that this work reasonably on pre-standard text,
# Gàidhlig, or Manx, so that, for example,
# h-Éireann is a "regular" capitalized word even w/ hyphen
# NB: input word can also be a MWE, with underscores in place of spaces!
# Classify the capitalization of $w and return it as a 4-bit mask
# (1 = capitalized after any mutation prefix, 2 = literal first letter
# capitalized, 4 = camel case, 8 = all caps).
sub cap_style {
	my ($w) = @_;
	my $style = 0;
	$style |= 1 if ($w =~ m/^'*((([bdm]|dh)'|[hnt]-?)[AEIOUÁÉÍÓÚÀÈÌÒÙ]|m-?B|g-?C|n-?[DG]|bh-?F|b-?P|t-?S|d-?T|\p{Lu})/);
	$style |= 2 if ($w =~ m/^\p{Lu}/);
	$style |= 4 if ($w =~ m/^...*[_-]\p{Lu}/);

	# every word of an MWE must be all caps, optionally after a
	# lowercase mutation prefix
	my $allcaps_word = qr/^'*(([hnt]-?)[AEIOUÁÉÍÓÚÀÈÌÒÙ]|m-?B|g-?C|n-?[DG]|bh-?F|b-?P|t-?S|d-?T)?(\p{Lu}|['-])*$/;
	my @chunks = ($w =~ m/([^_]+)/g);
	my @not_allcaps = grep { $_ !~ $allcaps_word } @chunks;
	$style |= 8 if (!@not_allcaps and $w =~ /\p{Lu}.*\p{Lu}/);
	return $style;
}

# Same as model/tolow.pl; handles single words and multi-word expressions.
# Lowercases $w, first inserting the hyphen that lowercase spelling
# needs after an n/t prefix before a capital vowel (nAspal -> n-aspal).
# Tokens starting with '<' or a backslash are returned unchanged
# (backslash for '\n' only).
sub irishlc {
	my ($w) = @_;
	return $w if ($w =~ /^[<\\]/);
	# prefix at the very start of the string, or right after a space
	$w =~ s/(^| )([nt])([AEIOUÁÉÍÓÚ])/$1$2-$3/g;
	return lc($w);
}

# Whatever we do to the corpus in model/makefile we need to do here!
# Unicode apostrophes are already handled at STDIN.
# Lowercases the token, then collapses numbers, URIs, very long tokens,
# @usernames and email addresses into their placeholder tokens.
sub ngram_preprocess {
	my ($w) = @_;
	my $norm = irishlc($w);
	# applied in this order, each to the result of the previous one
	my @token_classes = (
		[ qr/^[0-9][0-9,.:]*$/, '<NUM>' ],
		[ qr/^.+:\/\/.*$/, '<URI>' ],
		[ qr/^.{70}.*$/, '<LONG>' ],
		[ qr/^@[A-Za-z0-9_]+$/, '<USER>' ],
		[ qr/^[A-Za-z0-9].*@.+$/, '<EMAIL>' ],
	);
	for my $class (@token_classes) {
		my ($patt, $placeholder) = @{$class};
		$norm =~ s/$patt/$placeholder/;
	}
	return $norm;
}

# Only called for n <= the maximum stored in the precomputed language
# model (usually 3).  Given "X Y Z" this returns log P(Z | X Y); given
# "X Y", log P(Y | X); and for a single word "X", log P(X).
# An n-gram that is missing from Redis DB 0 is scored by recursing on
# the n-gram minus its first word and adding the value stored in DB 1
# for the n-gram minus its last word, if there is one (presumably a
# backoff weight -- confirm against the model builder).  A missing
# 1-gram gets the value stored under '<UNSEEN>'.
sub compute_log_prob_helper {
	my ($ngram) = @_;
	my $logprob = $redis->get(encode('utf8', $ngram));
	return $logprob if (defined($logprob));

	if ($ngram !~ m/ /) {  # 1-gram
		$logprob = $redis->get('<UNSEEN>');
		print STDERR "Warning: prob of unseen token not found in DB\n" unless (defined($logprob));
		return $logprob;
	}

	# n>1: back off to the shorter n-gram
	(my $start = $ngram) =~ s/ [^ ]+$//;
	(my $tail = $ngram) =~ s/^[^ ]+ //;
	$logprob = compute_log_prob_helper($tail);
	$redis->select(1);
	my $smfactor = $redis->get(encode('utf8', $start));
	$redis->select(0);  # always switch back to the default DB
	$logprob += $smfactor if (defined($smfactor));
	return $logprob;
}

# Log of the conditional probability P(X|Y) of seeing k-gram $X
# (k is generically 1, but can be as big as the biggest RHS in multi-xx)
# given the preceding j-gram $Y (j is almost always 2, except while
# processing the first couple of words of input, when it can be 0 or 1).
# So "$Y $X" is what's in the source text.  An empty $Y is replaced by
# '.', and the words of $X are scored one at a time, sliding the
# context along after each.
sub compute_log_prob {
	my ($X, $Y) = @_;
	my $context = ($Y eq '') ? '.' : $Y;
	my $total = 0;
	for my $w ($X =~ m/([^ ]+)/g) {
		$total += compute_log_prob_helper(extend_sentence($context, $w));
		$context = shift_ngram($context, $w);
	}
	return $total;
}

# Takes a source language token (MWEs with underscores allowed)
# and returns a hashref whose keys are candidate translations and whose
# values are the number of rules applied to get there.
# The second argument is the number of rules applied so far; it is there
# because the function is recursive.
# Callers should call as: all_matches('focal', 0)
# NB: outside of test mode this sub is memoized, so the hashref that is
# returned can be the cached one; changes a caller makes to it persist.
# NB: rule replacements are evaluated as Perl code (s///ee below), so the
# rules file must come from a trusted source.
sub all_matches {
	(my $w, my $count) = @_;
	my %ans;
	# recursion guard: give up (empty answer) after too many rules
	return \%ans if ($count > $maxdepth);
	if (exists($candsref->{$w})) {
		for my $std (@{$candsref->{$w}}) {
			# mapping a word to itself is free; anything else costs one
			if ($std eq $w) {
				$ans{$std} = $count;
			}
			else {
				$ans{$std} = $count + 1;
			}
		}
	}
	# acceptable Irish only: 71ú, 6km, etc., as if added to clean.txt
	elsif ($w =~ m/^[0-9,.-]+([ckm]?[mgl]|[Ckúx]|0í|bn|msu|[kKMGTP]B)$/) {
		$ans{$w} = $count;
	}
	for my $rule (@rules) {
		my $p = $rule->{'patt'};
		if ($w =~ m/$p/) {
			my $r = $rule->{'repl'};
			my $cand = $w;
			# 'repl' is stored wrapped in double quotes, so the first /e
			# yields that quoted string and the second /e evaluates it,
			# interpolating the capture groups of the pattern
			$cand =~ s/$p/$r/eeg;
			my $subcount = $count;
			# rules of level -1 are applied without adding to the count
			$subcount++ unless ($rule->{'level'} == -1);
			# recurse, even if rule results in MWE! Will recurse on
			# the "pieces" as a last resort below, if necessary
			my $subans = all_matches($cand, $subcount);
			for my $a (keys %{$subans}) {
				next if (exists($spurious{"$w $a"}));
				# keep the cheapest way of reaching each candidate
				if (exists($ans{$a})) {  # if already found some other way
					$ans{$a} = $subans->{$a} if ($subans->{$a} < $ans{$a});
				}
				else {
					$ans{$a} = $subans->{$a};
				}
			}
			# rule produces multiword: oidhche-sin => oidhche_sin
			# (only tried while nothing at all has been found so far)
			if ($cand =~ m/^([^_]+)_(.+)$/ and scalar keys %ans == 0) {
				my $left = $1;
				my $right = $2;
				my $subans_l = all_matches($left, $subcount);
				my $subans_r = all_matches($right, $subcount);
				# every left/right combination, joined with a space and
				# costed at the more expensive of its two halves
				for my $a (keys %{$subans_l}) {
					for my $b (keys %{$subans_r}) {
						$ans{"$a $b"} = max($subans_l->{$a}, $subans_r->{$b});
					}
				}
			}
		}
	}
	return \%ans;
}

# Load every external resource into the file-level globals:
#   @rules     from rules$extension.txt (tab-separated: pattern,
#              replacement, level; lines starting with '#' are comments)
#   %spurious  from spurious$extension.txt (one pair per line)
#   $candsref  from the Storable file cands$extension.hash
#   $redis     a connection to the Redis server on the default address
# Dies if a file cannot be opened or loaded, or if Redis is unreachable.
# Blank lines in the rules file are skipped; other lines that do not
# have the three expected fields are skipped with a warning instead of
# silently repeating the previous rule.
sub load_databases {
	my $rules_file = "rules$extension.txt";
	open(my $rules_fh, "<:utf8", $rules_file) or die "Could not open spelling rules file: $!";
	while (my $line = <$rules_fh>) {
		next if ($line =~ /^#/);
		chomp $line;
		my ($patt, $repl, $level) = ($line =~ m/^(\S+)\t(\S+)\t([0-9-]+)$/);
		unless (defined($level)) {
			warn "Ignoring malformed line $. of $rules_file\n" if ($line =~ /\S/);
			next;
		}
		# wrap in double quotes so that s///ee in all_matches evaluates
		# the replacement as an interpolated string
		$repl =~ s/(.+)/"$1"/;
		push @rules, {
			'patt' => qr/$patt/,
			'level' => $level,
			'repl' => $repl,
		};
	}
	close $rules_fh;

	my $spurious_file = "spurious$extension.txt";
	open(my $spurious_fh, "<:utf8", $spurious_file) or die "Could not open list of spurious pairs: $!";
	while (my $pair = <$spurious_fh>) {
		chomp $pair;
		$spurious{$pair}++;
	}
	close $spurious_fh;

	# Storable reports I/O problems by returning undef
	my $cands_file = "cands$extension.hash";
	$candsref = retrieve($cands_file);
	die "Could not load candidates from $cands_file: $!" unless (defined($candsref));

	# default is 127.0.0.1:6379
	eval { $redis = Redis->new(encoding => undef); 1; }
		or die "Unable to connect to Redis server: $@";
}

# Print the pairs of the highest-scoring hypothesis in the given hash
# (passed in as a hashref rather than using the global %hypotheses), then
# replace that hypothesis with a fresh empty one under the same key.
# The best hypothesis is found by comparing the entries with each other,
# not against a fixed floor, so a long run whose scores have all sunk
# very low still gets flushed instead of being silently dropped.
# If the hash is empty nothing is printed and an empty hypothesis is
# stored under the key ''.
sub flush_best_hypothesis {
	(my $hashref) = @_;
	my $bestkey;
	print "Flushing best of ".scalar(keys %{$hashref})." hypotheses\n" if ($verbose);
	for my $k (keys %{$hashref}) {
		# strict '>' keeps the first key seen when scores tie
		if (!defined($bestkey) or $hashref->{$k}->{'logprob'} > $hashref->{$bestkey}->{'logprob'}) {
			$bestkey = $k;
		}
	}
	print "FLUSH:\n" if ($verbose);
	if (defined($bestkey)) {
		print hypothesis_pairs_string($hashref->{$bestkey});
	}
	else {
		$bestkey = '';
	}
	$hashref->{$bestkey} = {
		'logprob' => 0.0,
		'output' => [],
	};
}

# Append a token that plays no part in the n-gram model to the output of
# every live hypothesis, unchanged (source and target are both $tok).
sub process_ignorable_token {
	my ($tok) = @_;

	print "Processing ignorable: $tok\n" if $verbose;
	for my $hyp (values %hypotheses) {
		push @{$hyp->{'output'}}, {'s' => $tok, 't' => $tok};
	}
}

# Process one source token: look up its candidate standardizations,
# extend every live hypothesis with every candidate, and keep only the
# best new hypothesis for each distinct pair of final words.
# In -u mode it only prints the token when it is unknown, and in -c mode
# it only prints the candidate pairs; in both cases no hypotheses are
# built.  Updates the globals $tokens, $unknown and %hypotheses.
sub process_one_token {
	(my $tok) = @_;

	$tokens++;
	my %newhypotheses;
	# NB: all_matches may be memoized, so $hashref can be the cached
	# structure; the additions/deletions below act on that shared copy
	my $hashref = all_matches($tok, 0);
	my $unknown_p = (scalar keys %{$hashref} == 0);

	# if there were no matches in %{$candsref}, and none computed
	# by applying rules, then leave the token unchanged
	if ($unknown_p) {
		$hashref->{$tok} = 0;
		$unknown++;
		print "UNKNOWN: $tok\n" if $verbose;
		if ($unknowns) {
			print "$tok\n";
			# take the placeholder out again so a repeat of this token
			# is still reported as unknown
			delete $hashref->{$tok};
		}
	}
	else {
		if ($candidates) {
			for my $x (keys %{$hashref}) {
				print prep_for_output($tok, $x)."\n";
			}
		}
	}
	# the listing modes stop here; no hypotheses are built
	return if ($unknowns or $candidates);

	print "Input token = $tok\n" if $verbose;
	for my $x (keys %{$hashref}) {
		my $normalized_x = ngram_preprocess($x);
		print "Possible standardization: $x, normalized: $normalized_x\n" if $verbose;
		for my $two (keys %hypotheses) {
			# copy the output list so the old hypothesis is not modified
			my @newoutput = @{$hypotheses{$two}->{'output'}};
			push @newoutput, {'s' => $tok, 't' => $x};
			my $tail = extend_sentence($two, $normalized_x);
			my $candlogprob = compute_log_prob($normalized_x, $two);
			# score = old score + language model score of the candidate
			# in context - penalty for each rule used to derive it
			my %newhyp = (
				'logprob' => $hypotheses{$two}->{'logprob'} + $candlogprob - $penalty*$hashref->{$x},
				'output' => \@newoutput,
			);
			if ($verbose) {
				print "Created a new hypothesis (".$newhyp{'logprob'}."): ".hypothesis_output_string(\%newhyp)."\n";
				print "Computed from logprob of best hypothesis with key $two: ".$hypotheses{$two}->{'logprob'}."\n";
				print "Plus logprob of n-gram: $tail ($candlogprob)\n";
				print "Minus penalty $penalty times ".$hashref->{$x}."\n";
			}
			my $newtwo = last_two_words($tail);
			if (exists($newhypotheses{$newtwo})) {
				# need only keep the best among those ending w/ these two words
				if ($newhypotheses{$newtwo}->{'logprob'} < $newhyp{'logprob'}) {
					$newhypotheses{$newtwo} = \%newhyp;
					print "And it's the best so far ending in: $newtwo\n" if $verbose;
				}
				else {
					print "But not as good as (".$newhypotheses{$newtwo}->{'logprob'}."): ".hypothesis_output_string($newhypotheses{$newtwo})."\n" if $verbose;
				}
			}
			else {
				$newhypotheses{$newtwo} = \%newhyp;
				print "And it's the first (hence best) so far ending in: $newtwo\n" if $verbose;
			}
		}
	}

	# if there's only one hypothesis left, we can flush output and reset
	flush_best_hypothesis(\%newhypotheses) if (scalar keys %newhypotheses == 1);
	%hypotheses = %newhypotheses;

	if ($verbose) {
		print "Live hypotheses:\n";
		for my $two (keys %hypotheses) {
			print "Hypothesis with key '$two' (".$hypotheses{$two}->{'logprob'}."): ".hypothesis_output_string($hypotheses{$two})."\n";
		}
	}
	# when evaluating, don't want to memoize the fake answer for unknown tokens
	delete $hashref->{$tok} if ($verbose and $unknown_p);
}

# Replace typographic apostrophes and hyphens by their ASCII
# equivalents, but only in tokens that contain at least one letter, so
# that a bare punctuation token is passed through unchanged.
sub normalize_apost_and_dash {
	my ($w) = @_;
	return $w unless ($w =~ m/[a-zA-ZáéíóúÁÉÍÓÚàèìòùÀÈÌÒÙ]/);
	$w =~ s/[ʼ’]/'/g;
	$w =~ s/[‐−‑]/-/g;  # U+2010, U+2212, U+2011 to ASCII
	return $w;
}

# Main loop: read one token per line from STDIN, feed each to the
# hypothesis machinery, flush whatever is pending at end of input and,
# in verbose mode, print the token statistics.
sub translate_stdin {
	print "Ready.\n" if $verbose;
	while (defined(my $line = <STDIN>)) {
		chomp $line;
		# skip SGML markup+newlines, only things to completely ignore in n-gram model
		# can match other "special" tokens with [:@&;=,.] which should all
		# remain unchanged, but are part of n-gram model
		if ($line eq '\n' or $line =~ /^<.+>$/) {
			process_ignorable_token($line);
			next;
		}

		my $w = normalize_apost_and_dash($line);
		# the ordinary case: no apostrophe at either edge
		unless ($line =~ /^['ʼ’]/ or $line =~ /['ʼ’]$/) {
			process_one_token($w);
			next;
		}

		# keep the edge apostrophes when the token is known as it
		# stands (possibly after lowercasing) or is nothing but apostrophes
		if (exists($candsref->{$w}) or $line =~ m/^['ʼ’]+$/ or
			($w =~ m/^'*[A-ZÁÉÍÓÚÀÈÌÒÙ]/ and exists($candsref->{lc($w)}))) {
			process_one_token($w);
			next;
		}

		# otherwise split the edge apostrophes off as separate tokens
		my ($lead, $core, $trail) = ($line =~ m/^(['ʼ’]*)(.*[^'ʼ’])(['ʼ’]*)$/);
		process_one_token($lead) if ($lead ne '');
		process_one_token(normalize_apost_and_dash($core)) if ($core ne '');
		process_one_token($trail) if ($trail ne '');
	}

	flush_best_hypothesis(\%hypotheses) unless ($unknowns);

	return unless ($verbose);
	print "Total tokens: $tokens\n";
	print "Unknown tokens: $unknown\n";
	if ($tokens > 0) {
		my $frac = $unknown / (1.0 * $tokens);
		print "Fraction unknown: $frac\n";
	}
}

# Report one unit test result in "ok N - comment" / "not ok N - comment"
# form.  Failures are always printed, successes only in verbose mode.
# Returns the tested value unchanged.
sub assert {
	my ($bool_expr, $number, $comment) = @_;
	if (!$bool_expr) {
		print "not ok $number - $comment\n";
	}
	elsif ($verbose) {
		print "ok $number - $comment\n";
	}
	return $bool_expr;
}

# Built-in self test (the -t switch).  Every check goes through assert(),
# which prints TAP-style "ok" / "not ok" lines; the "1..N" plan line is
# printed at the end in verbose mode only.
# load_databases() must have been called first: irishtc consults
# $candsref, and the compute_log_prob tests query the Redis language model.
sub run_unit_tests {
	my $testnum = 1;

	# test irishtc
	assert(irishtc('an',0) eq 'an',$testnum++,'irishtc of function word in camel case');
	assert(irishtc('na',0) eq 'na',$testnum++,'irishtc of function word in camel case');
	assert(irishtc('mbaile',0) eq 'mBaile',$testnum++,'irishtc of eclipsed word in camel case');
	assert(irishtc('mbaile',1) eq 'mBaile',$testnum++,'irishtc of eclipsed word');
	assert(irishtc('an',1) eq 'An',$testnum++,'irishtc of function word at start');
	assert(irishtc('mccartney',1) eq 'McCartney',$testnum++,'irishtc of Mc surname');
	assert(irishtc("o'reilly",1) eq "O'Reilly",$testnum++,'irishtc of O surname');
	assert(irishtc('an bhean',1) eq 'An bhean',$testnum++,'irishtc of a typical MWE');
	assert(irishtc('n-oileán',1) eq 'nOileán',$testnum++,'irishtc requiring dropped hyphen after n');
	assert(irishtc('t-aire',1) eq 'tAire',$testnum++,'irishtc requiring dropped hyphen after t');
	assert(irishtc('r-phost',1) eq 'R-phost',$testnum++,'irishtc where we do not drop hyphen');

	# test irishlc
	assert(irishlc('ABC') eq 'abc',$testnum++,'irishlc of all capital word');
	assert(irishlc('Gaeilge') eq 'gaeilge',$testnum++,'irishlc of typical capitalized word');
	assert(irishlc('bean') eq 'bean',$testnum++,'irishlc of already-lowercase word');
	assert(irishlc('nOileán') eq 'n-oileán',$testnum++,'irishlc requiring inserted hyphen');
	assert(irishlc('tAire') eq 't-aire',$testnum++,'irishlc requiring inserted hyphen after t');
	assert(irishlc('<P>') eq '<P>',$testnum++,'irishlc should not lowercase markup');
	assert(irishlc('<a href="http://example.com/Tfij45R">') eq '<a href="http://example.com/Tfij45R">',$testnum++,'irishlc should not lowercase any attributes either');
	assert(irishlc('Cósta Ríce') eq 'cósta ríce',$testnum++,'irishlc of MWE');
	assert(irishlc('Cré na nAspal') eq 'cré na n-aspal',$testnum++,'irishlc of MWE requiring inserted hyphen');
	assert(irishlc('a Ċaoiṁín') eq 'a ċaoiṁín',$testnum++,'irishlc of dotted consonants');

	# test prep_for_output (in lieu of cap_style, recapitalize separately)
	assert(prep_for_output('an','an') eq 'an => an',$testnum++,'prep_for_output of trivial pair');
	assert(prep_for_output(',',',') eq ', => ,',$testnum++,'prep_for_output of trivial pair, punctuation');
	assert(prep_for_output('\n','\n') eq '\n => \n',$testnum++,'prep_for_output of trivial pair, newline');
	assert(prep_for_output('Pádraig','Pádraig') eq 'Pádraig => Pádraig',$testnum++,'prep_for_output of trivial pair, already capitalized');
	assert(prep_for_output('mBaile','mBaile') eq 'mBaile => mBaile',$testnum++,'prep_for_output of trivial pair, eclipsed capital');
	assert(prep_for_output('ABC','ABC') eq 'ABC => ABC',$testnum++,'prep_for_output of trivial pair, allcaps');
	assert(prep_for_output('Ard-Chúirt','Ard-Chúirt') eq 'Ard-Chúirt => Ard-Chúirt',$testnum++,'prep_for_output of trivial pair, camel case');
	assert(prep_for_output('bPríomh-Acht','bPríomh-Acht') eq 'bPríomh-Acht => bPríomh-Acht',$testnum++,'prep_for_output of trivial pair, eclipsed camel case');
	assert(prep_for_output('fhíor-Éireannach','fhíor-Éireannach') eq 'fhíor-Éireannach => fhíor-Éireannach',$testnum++,'prep_for_output of trivial pair, camel case');
	assert(prep_for_output('31ú','31ú') eq '31ú => 31ú',$testnum++,'prep_for_output of trivial pair, letters and numbers');
	assert(prep_for_output('anbhfann','anbhann') eq 'anbhfann => anbhann',$testnum++,'prep_for_output of non-trivial pair, lower to lower (cap_style==0)');
	assert(prep_for_output('caidé','cad é') eq 'caidé => cad é',$testnum++,'prep_for_output of non-trivial pair, lower to lower, multiword target (cap_style==0)');
	assert(prep_for_output('do_bhí','bhí') eq 'do bhí => bhí',$testnum++,'prep_for_output of non-trivial pair, lower to lower, multiword source (cap_style==0)');
	assert(prep_for_output('do_réir','de réir') eq 'do réir => de réir',$testnum++,'prep_for_output of non-trivial pair, lower to lower, multiword source and target (cap_style==0)');
	assert(prep_for_output("'sa","sa") eq "'sa => sa",$testnum++,'prep_for_output of non-trivial pair, lower to lower, initial apost (still cap_style==0)');
	assert(prep_for_output("4adh","4ú") eq "4adh => 4ú",$testnum++,'prep_for_output of non-trivial pair, lower to lower, initial digit (still cap_style==0)');
	assert(prep_for_output('mBolg','mbolg') eq 'mBolg => mBolg',$testnum++,'prep_for_output of non-trivial pair, mutated upper, lowercase target (cap_style==1)');
	assert(prep_for_output("d'Abrán","d'Aibreán") eq "d'Abrán => d'Aibreán",$testnum++,"prep_for_output of non-trivial pair, d' + capital to same in target, compare D'imthigh below (cap_style==1)");
	assert(prep_for_output("dh'Arm","d'arm") eq "dh'Arm => d'Arm",$testnum++,"prep_for_output of non-trivial pair, treat Gàidhlig dh' like d', m', etc. (cap_style==1)");
	assert(prep_for_output('tAthair','t-athair') eq 'tAthair => tAthair',$testnum++,'prep_for_output of non-trivial pair, mutated upper, lowercase target requiring hyphen be dropped (cap_style==1)');
	assert(prep_for_output('g-Craoibh','gcraobh') eq 'g-Craoibh => gCraobh',$testnum++,'prep_for_output of non-trivial pair, mutated upper with hyphen, lowercase target (cap_style==1)');
	assert(prep_for_output('gConnradh','gConradh') eq 'gConnradh => gConradh',$testnum++,'prep_for_output of non-trivial pair, mutated upper in source and target (cap_style==1)');
	assert(prep_for_output('h-Iùdhaich','Giúdaigh') eq 'h-Iùdhaich => Giúdaigh',$testnum++,'prep_for_output of non-trivial pair, mutated upper in source and capitalized target not needing mutation (cap_style==1)');
	assert(prep_for_output("'Steach","isteach") eq "'Steach => Isteach",$testnum++,'prep_for_output of non-trivial pair, initial apost+upper in source to lowercase (still cap_style==1)');
	assert(prep_for_output('Shiubhail','shiúil') eq 'Shiubhail => Shiúil',$testnum++,'prep_for_output of non-trivial pair, simple capital source, lower target (cap_style==3)');
	assert(prep_for_output('Bhfuil','bhfuil') eq 'Bhfuil => bhFuil',$testnum++,'prep_for_output of non-trivial pair, bad capitalization of eclipsis in source, lower target (cap_style==3)');
	assert(prep_for_output('Dtáinic','dtáinig') eq 'Dtáinic => dTáinig',$testnum++,'prep_for_output of non-trivial pair, bad capitalization of eclipsis in source, lower target (cap_style==3)');
	assert(prep_for_output('House','house') eq 'House => House',$testnum++,'prep_for_output of non-trivial pair, simple capital H in source, lower target, hOuse would be bad (cap_style==3)');
	assert(prep_for_output('Halla','halla') eq 'Halla => Halla',$testnum++,'prep_for_output of non-trivial pair, simple capital H in source, lower target, hAlla would be odd even though alla is an Ir. word (cap_style==3)');
	assert(prep_for_output('Sìde','haimsire') eq 'Sìde => hAimsire',$testnum++,'prep_for_output of non-trivial pair, simple capital in source, h+lower in target which we recognize as Irish (cap_style==3)');
	assert(prep_for_output('Nerin','hÉireann') eq 'Nerin => hÉireann',$testnum++,'prep_for_output of non-trivial pair, simple capital in source, h+capital in target which we want to keep (cap_style==3)');
	assert(prep_for_output("D'imthigh","d'imigh") eq "D'imthigh => D'imigh",$testnum++,"prep_for_output of non-trivial pair, simple capital D' to lower, compare d'Abrán above (cap_style==3)");
	assert(prep_for_output('Do_dhein','rinne') eq 'Do dhein => Rinne',$testnum++,'prep_for_output of non-trivial pair, simple capital multiword source, lower target (cap_style==3)');
	assert(prep_for_output('Ghlanas-sa','ghlan mise') eq 'Ghlanas-sa => Ghlan mise',$testnum++,'prep_for_output of non-trivial pair, simple capital source, lowercase multiword target (cap_style==3)');
	assert(prep_for_output('Didomhnaich','Dé Domhnaigh') eq 'Didomhnaich => Dé Domhnaigh',$testnum++,'prep_for_output of non-trivial pair, simple capital source, already capitalized multiword in target, bugfix 2016-06-17 (cap_style==3)');
	assert(prep_for_output('I_n-aice','in aice') eq 'I n-aice => In aice',$testnum++,'prep_for_output of non-trivial pair, capital multiword source, lowercase multiword target (cap_style==3)');
	assert(prep_for_output('Sasanaighibh','Sasanaigh') eq 'Sasanaighibh => Sasanaigh',$testnum++,'prep_for_output of non-trivial pair, simple capital source, target already capitalized (cap_style==3)');
	assert(prep_for_output('hAon-Mhac','haonmhac') eq 'hAon-Mhac => hAonmhac',$testnum++,'prep_for_output of non-trivial pair, mutated camel case in source, lowercase target no hyphens (cap_style==5)');
	assert(prep_for_output('hAon-Dhéag','haon déag') eq 'hAon-Dhéag => hAon Déag',$testnum++,'prep_for_output of non-trivial pair, mutated camel case in source, lowercase target multiword (cap_style==5)');
	assert(prep_for_output('Sean-Nós','sean-nós') eq 'Sean-Nós => Sean-Nós',$testnum++,'prep_for_output of non-trivial pair, camel case hyphenated word in source, hyphenated word in target with pieces needing capitalization (cap_style==7)');
	assert(prep_for_output('Mheadhon-Lae','mheán lae') eq 'Mheadhon-Lae => Mheán Lae',$testnum++,'prep_for_output of non-trivial pair, camel case multiword in source, target multiword (cap_style==7)');
	assert(prep_for_output('Lhie_ny_Greiney','luí na gréine') eq 'Lhie ny Greiney => Luí na Gréine',$testnum++,'prep_for_output of non-trivial pair, camel case multiword in source, target multiword including article unchanged in titlecase (cap_style==7)');
	assert(prep_for_output('hACHTANNA','hachtanna') eq 'hACHTANNA => hACHTANNA',$testnum++,'prep_for_output of non-trivial pair, source mutated allcaps, target mutated lowercase (cap_style==9)');
	assert(prep_for_output('nIRISLEABHAR','n-irisleabhar') eq 'nIRISLEABHAR => nIRISLEABHAR',$testnum++,'prep_for_output of non-trivial pair, source mutated allcaps, target mutated lowercase with hyphen that we must drop (cap_style==9)');
	assert(prep_for_output('NAOMHTHA','naofa') eq 'NAOMHTHA => NAOFA',$testnum++,'prep_for_output of non-trivial pair, source allcaps, target in lowercase (cap_style==11)');
	assert(prep_for_output('GCONNTAE','gcontae') eq 'GCONNTAE => gCONTAE',$testnum++,'prep_for_output of non-trivial pair, source allcap including eclipsis, target lowercase (cap_style==11)');
	assert(prep_for_output('SÉAGHAINÍN','Seáinín') eq 'SÉAGHAINÍN => SEÁINÍN',$testnum++,'prep_for_output of non-trivial pair, source allcaps, target already titlecase (cap_style==11)');
	assert(prep_for_output('AN_nGEOBHADH','an bhfaigheadh') eq 'AN nGEOBHADH => AN bhFAIGHEADH',$testnum++,'prep_for_output of non-trivial pair, recognize source as allcaps despite eclipsis on subsequent word, multiword lowercase target (cap_style==11)');
	assert(prep_for_output('gCRUADH-CHÁS','gcruachás') eq 'gCRUADH-CHÁS => gCRUACHÁS',$testnum++,'prep_for_output of non-trivial pair, source mutated allcap, hyphenated, target lowercase (cap_style==13)');
	assert(prep_for_output('tSEAN-AIMSIR','tseanaimsir') eq 'tSEAN-AIMSIR => tSEANAIMSIR',$testnum++,'prep_for_output of non-trivial pair, source mutated allcap, hyphenated, target lowercase (cap_style==13)');
	assert(prep_for_output('SO-BHLASTA','so-bhlasta') eq 'SO-BHLASTA => SO-BHLASTA',$testnum++,'prep_for_output of non-trivial pair, source allcap with hyphen, target lowercase with hyphen (cap_style==15)');
	assert(prep_for_output('BPRIMH-CHISTE','bpríomhchiste') eq 'BPRIMH-CHISTE => bPRÍOMHCHISTE',$testnum++,'prep_for_output of non-trivial pair, source allcap including eclipsis and hyphen, target lowercase (cap_style==15)');

	# test extend_sentence
	assert(extend_sentence('','amach') eq 'amach',$testnum++,'extend_sentence is called with empty first arg iff at start of input');
	assert(extend_sentence('amach','le') eq 'amach le',$testnum++,'extend_sentence is called with single word as first arg only near start of input');
	assert(extend_sentence('amach le','mo') eq 'amach le mo',$testnum++,'extend_sentence, as called generically');

	# test last_two_words
	assert(last_two_words('duine') eq 'duine',$testnum++,'last_two_words called with single-word argument iff at start of input');
	assert(last_two_words('duine beag') eq 'duine beag',$testnum++,'last_two_words called with two-word argument iff near start of input');
	assert(last_two_words('duine beag gránna') eq 'beag gránna',$testnum++,'generic three-word input to last_two_words');
	assert(last_two_words('. uaireanta i mo') eq 'i mo',$testnum++,'last_two_words can be called with > three-word argument if we append MWE from pairs-xx.txt');

	# test shift_ngram
	assert(shift_ngram('.','amach') eq '. amach',$testnum++,'first arg shift_ngram is artifically set to a fullstop at start of input, never empty');
	assert(shift_ngram('amach','le') eq 'amach le',$testnum++,'second call to shift_ngram near start of input looks like this');
	assert(shift_ngram('amach le','fear') eq 'le fear',$testnum++,'generic call to shift_ngram');

	# test ngram_preprocess
	assert(ngram_preprocess('0') eq '<NUM>',$testnum++,'ngram_preprocess converts simple one-digit number to <NUM>');
	assert(ngram_preprocess('00034') eq '<NUM>',$testnum++,'ngram_preprocess converts simple number to <NUM> even with leading zeroes');
	assert(ngram_preprocess('1991') eq '<NUM>',$testnum++,'ngram_preprocess converts simple number to <NUM>');
	assert(ngram_preprocess('3,219') eq '<NUM>',$testnum++,'ngram_preprocess converts numbers with commas to <NUM>');
	assert(ngram_preprocess('20:01:43') eq '<NUM>',$testnum++,'ngram_preprocess converts hh:mm::ss to <NUM>');
	assert(ngram_preprocess('12:25') eq '<NUM>',$testnum++,'ngram_preprocess converts hh:mm to <NUM>');
	assert(ngram_preprocess('1:1000') eq '<NUM>',$testnum++,'ngram_preprocess converts ratios to <NUM>');
	assert(ngram_preprocess('1.79') eq '<NUM>',$testnum++,'ngram_preprocess converts numbers with decimals to <NUM>');
	assert(ngram_preprocess('3.5.8') eq '<NUM>',$testnum++,'ngram_preprocess converts stuff that looks like version numbers to <NUM>');
	assert(ngram_preprocess('-' x 69) eq ('-' x 69),$testnum++,'ngram_preprocess does not convert 69 character token');
	assert(ngram_preprocess('-' x 70) eq '<LONG>',$testnum++,'ngram_preprocess converts 70 character token to <LONG>');
	assert(ngram_preprocess('-' x 200) eq '<LONG>',$testnum++,'ngram_preprocess converts 200 character token to <LONG>');
	assert(ngram_preprocess('http://igaeilge.wordpress.com/?p=351#comment-389') eq '<URI>',$testnum++,'ngram_preprocess converts http URLs to <URI>');
	assert(ngram_preprocess('https://vimeo.com/01234567') eq '<URI>',$testnum++,'ngram_preprocess converts https URLs to <URI>');
	assert(ngram_preprocess('ftp://alpha.gnu.org/gnu/bison/') eq '<URI>',$testnum++,'ngram_preprocess converts ftp URLs to <URI>');
	assert(ngram_preprocess('http://fiontardcu.wordpress.com/2013/04/02/maria-ni-shuilleabhain-msc-i-ngno-i-dteicneolaiocht-an-eolais/') eq '<URI>',$testnum++,'ngram_preprocess converts very long URLs to <URI>, not <LONG>');
	assert(ngram_preprocess('@kscanne') eq '<USER>',$testnum++,'ngram_preprocess converts Twitter usernames to <USER>');
	assert(ngram_preprocess('@KScanne') eq '<USER>',$testnum++,'ngram_preprocess converts Twitter usernames to <USER>, case insensitive');
	assert(ngram_preprocess('@leaders_indig') eq '<USER>',$testnum++,'ngram_preprocess converts Twitter usernames with underscores to <USER>');
	assert(ngram_preprocess('kscanne@example.com') eq '<EMAIL>',$testnum++,'ngram_preprocess converts (simple) email addresses to <EMAIL>');
	assert(ngram_preprocess('k.scannell.1@example123.ac.uk') eq '<EMAIL>',$testnum++,'ngram_preprocess converts (simple) email addresses to <EMAIL>');

	# test normalize_apost_and_dash
	assert(normalize_apost_and_dash('t‐asal') eq 't-asal',$testnum++,'normalize_apost_and_dash converts 2010 dash to ASCII');
	assert(normalize_apost_and_dash('‐') eq '‐',$testnum++,'normalize_apost_and_dash leaves 2010 dash unchanged when it is a token');
	assert(normalize_apost_and_dash('an‑álainn') eq 'an-álainn',$testnum++,'normalize_apost_and_dash converts 2011 dash to ASCII');
	assert(normalize_apost_and_dash('‑') eq '‑',$testnum++,'normalize_apost_and_dash leaves 2011 dash unchanged when it is a token');
	assert(normalize_apost_and_dash('an−álainn') eq 'an-álainn',$testnum++,'normalize_apost_and_dash converts 2212 dash to ASCII');
	assert(normalize_apost_and_dash('−') eq '−',$testnum++,'normalize_apost_and_dash leaves 2212 dash unchanged when it is a token');
	assert(normalize_apost_and_dash('bʼiúd') eq "b'iúd",$testnum++,'normalize_apost_and_dash converts 02BC to ASCII apostrophe');
	assert(normalize_apost_and_dash('ʼsa') eq "'sa",$testnum++,'normalize_apost_and_dash converts even initial 02BC to ASCII apostrophe');
	assert(normalize_apost_and_dash('arsʼ') eq "ars'",$testnum++,'normalize_apost_and_dash converts even final 02BC to ASCII apostrophe');
	assert(normalize_apost_and_dash('má’s') eq "má's",$testnum++,'normalize_apost_and_dash converts 2019 to ASCII apostrophe');
	assert(normalize_apost_and_dash('’fhios') eq "'fhios",$testnum++,'normalize_apost_and_dash converts even initial 2019 to ASCII apostrophe');
	assert(normalize_apost_and_dash('dob’') eq "dob'",$testnum++,'normalize_apost_and_dash converts even final 2019 to ASCII apostrophe');

	# test compute_log_prob (these compare scores from the Redis model)
	assert(compute_log_prob('cónaí','sí ina') > compute_log_prob('chónaí','sí ina'),$testnum++,'"cónaí" more likely than "chónaí" after "sí ina"');
	assert(compute_log_prob('ar','ag filleadh') > compute_log_prob('do','ag filleadh'),$testnum++,'"ar" more likely than "do" after "ag filleadh"');
	assert(compute_log_prob('héireann','poblacht na') > compute_log_prob('éireann','poblacht na'),$testnum++,'prefix-h more likely than not after "poblacht na"');
	assert(compute_log_prob('cliath','baile átha') > compute_log_prob('luain','baile átha'),$testnum++,'bác more popular than bál');
	assert(compute_log_prob('bhí','') > compute_log_prob('raibh',''),$testnum++,'bhí more likely than raibh at start of input');
	assert(compute_log_prob('tá','') > compute_log_prob('bhfuil',''),$testnum++,'tá more likely than bhfuil at start of input');
	assert(compute_log_prob('tar éis briseadh','go díreach') > compute_log_prob('ar bhriseadh','go díreach'),$testnum++,'choose correct translation of gd "air_briseadh" in context, note MWE is possible as first argument');
	assert(compute_log_prob('ar bhriseadh','smaoineamh') > compute_log_prob('tar éis briseadh','smaoineamh'),$testnum++,'choose correct translation of gd "air_briseadh" in context, again');

	# $testnum is one past the last test that ran; step back for the plan line
	$testnum--;
	print "1..$testnum\n" if $verbose;
}

# Entry point.  A normal run caches the results of the two expensive
# lookups; the unit tests run without memoization.
unless ($runtests) {
	memoize('compute_log_prob');
	memoize('all_matches');
}

# the databases are needed in test mode too (recapitalization of h+vowel)
load_databases();

if ($runtests) {
	run_unit_tests();
}
else {
	translate_stdin();
}

exit 0;