#!/usr/bin/perl
# see README below
my $VERSION = '$Revision: 1.2 $';
#
# Note: use perl always with warnings and strict to prevent a lot of problems!
#
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
use Net::Dict;
my $host = 'localhost';
my $port = '2628';
my $database = 'foldoc';
my $prompt = "define> ";
my $dict;
my $eref;
my $entry;
my $db;
my $definition;
my %glossaryterms = ();
my $title = '';
my $opt_x = 1; # default on, see GetOptions
my $opt_d = 0; # test only! #FIXME#
my $opt_v = 0; # test only! #FIXME#
my $opt_b = 1; # default on, create a valid docbook
my $opt_C = 1; # default on (Correct) disable attempted spelling correction
# i.e.: use strategy exact only
my $strategy = 'exact'; #default for matching or defining
my $man = 0;
my $help = 0;
my $interactive = 0;
#
# Parse options and print usage if there is a syntax error,
# or if usage was explicitly requested.
#
# Getopt::Long supports two useful variants of simple options:
# negatable options and incremental options.
# A negatable option is specified with a exclamation mark "!"
# after the option name.
#
GetOptions(
'help' => \$help,
'man' => \$man,
'xml!' => \$opt_x,
'Correct!' => \$opt_C,
'strategy=s' => \$strategy,
'book!' => \$opt_b,
'verbose+' => \$opt_v,
'host=s' => \$host,
'port=s' => \$port,
'database=s' => \$database,
'interactive!' => \$interactive,
'debug!' => \$opt_d,
)
or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;
## If no arguments were given, then allow STDIN to be used only
## if it's not connected to a terminal (otherwise print usage)
pod2usage("$0: No files given.")
if ((@ARGV == 0) && (!$interactive) && (-t STDIN));
#-----------------------------------------------------------------------
# Turn off buffering on STDOUT
#-----------------------------------------------------------------------
$| = 1;
#-----------------------------------------------------------------------
# Create instance of Net::Dict, connecting either to a user-specified
# DICT server, or defaulting to dict.org
#-----------------------------------------------------------------------
## $host = @ARGV > 0 ? shift @ARGV : 'dict.org';
warn "Connecting to $host ..." if $opt_v;
$dict = Net::Dict->new(
$host,
Port => $port,
Client => "foldoc2docbook $VERSION",
### Timeout => 120,
Debug => $opt_d
)
or die;
warn "\n" if $opt_v;
sub main
{
my $glossterm = '';
#-----------------------------------------------------------------------
# This is the "SHOW DATABASES" command from RFC 2229.
#-----------------------------------------------------------------------
if ($opt_v)
{
my %dbhash = $dict->dbs();
warn "Available dictionaries:\n";
while (($db, $title) = each %dbhash)
{
warn "$db : $title\n" if defined $title;
}
}
$title = $dict->dbTitle($database);
if (!$title)
{
$database = 'wn' if $title = $dict->dbTitle('wn');
if (!$title)
{
$database = '*' if $title = $dict->dbTitle('*');
}
}
$dict->setDicts($database);
#-----------------------------------------------------------------------
# This returns a string, containing description of the dictionary $database.
#-----------------------------------------------------------------------
if ($opt_d and $opt_v)
{
my $dbInfo = $dict->dbInfo($database);
warn "Used dictionary:\n";
warn $dbInfo;
#
# NOTE: uc() to support case insensitive sort! ck
#
$glossaryterms{uc('00-database-info')} = [[$database, $dbInfo]]
if $opt_x;
#
# FIXME# after the dbInfo() call, the first define() call returns an empty reference or hang?
#
$eref = $dict->define($database);
}
$glossaryterms{uc($database)} =
[[$database, "$database\n \{$title\}\n\n"]] # FIXME should end with '\n\n\n'
if $opt_x;
#-----------------------------------------------------------------------
# Let the user repeatedly enter words, which we then look up.
#-----------------------------------------------------------------------
print $prompt if $interactive;
while (<>)
{
chomp; # avoid \n on last field
#TBD#prevent non-alphanumeric chars:
# $_ =~ s/[^\w\d_]+/_/gmo;
$_ =~ s/[\/_-]+/ /gmo;
$_ =~ s/\s[\s]+/ /gmo;
if (not $_ or defined $glossaryterms{uc($_)})
{
print $prompt if $interactive;
next;
}
#-------------------------------------------------------------------
# The define() method returns an array reference.
# The array has one entry for each definition found.
# If the referenced array has no entries, then there were no
# definitions in any of the dictionaries on the server.
#-------------------------------------------------------------------
### we need the glossterm $eref = $dict->define($_);
### if (@$eref == 0)
{
#-------------------------------------------------------------------
# The match() method returns an array reference.
# You can lookup a list of words which match a given pattern, using
# the match() method. Each DICT server typically supports a number of
# strategies which can be used to match words against a pattern.
#-------------------------------------------------------------------
$eref = $dict->match($_, $strategy);
if (@$eref == 0 and not $opt_C)
{
warn " use strategie 'lev' for \"$_\"\n" if $opt_v;
$eref = $dict->match($_, 'lev');
if (@$eref == 0)
{
warn " use strategie 'prefix' for \"$_\"\n" if $opt_v;
$eref = $dict->match($_, 'prefix');
}
}
if (@$eref != 0)
{
#---------------------------------------------------------------
# Each entry is another array reference. The referenced array
# for each entry has two elements:
# $db - the name of the database (ie dictionary)
# $definition - the definition mached the request
#---------------------------------------------------------------
foreach $entry (@$eref)
{
($db, $glossterm) = @$entry;
$eref = $dict->define($glossterm, $db);
warn " found '$glossterm' in \"$db\"\n" if $opt_v;
last; # Note: we need only one definition! ck
}
}
}
if (@$eref == 0)
{
warn " no definition for \"$_\"\n";
}
else
{
#---------------------------------------------------------------
# Each entry is another array reference. The referenced array
# for each entry has two elements:
# $db - the name of the database (ie dictionary)
# $definition - the text of the definition
#---------------------------------------------------------------
foreach $entry (@$eref)
{
($db, $definition) = @$entry;
next if not defined $db and $opt_d;
if ($opt_d or $interactive)
{
$title = $dict->dbTitle($db);
warn " found '$glossterm' in \"$title\"\n" if (defined $title and $opt_v);
warn "\n-------------------------------------------\n",
$definition;
}
if ($opt_x)
{
#
# this is a new glossterm (a foldoc definition):
#
# NOTE: uc() to support case insensitive sort! ck
#TBD#prevent non-alphanumeric chars:
# $glossterm =~ s/[^\w\d_]+/_/gmo;
$glossterm =~ s/[\/_-]+/ /gmo;
$glossterm =~ s/\s[\s]+/ /gmo;
$glossaryterms{uc($glossterm)} = $eref;
warn "found '$glossterm'" if $opt_v;
last; # Note: we need only one definition! ck
}
}
}
print $prompt if $interactive;
}
if ($opt_x)
{
print_prolog();
# FIXME# should be sorted like sort -ubf
## foreach $glossterm ( sort { uc$a cmp uc$b } keys %glossaryterms)
foreach $glossterm (sort keys %glossaryterms)
{
$eref = $glossaryterms{$glossterm};
#---------------------------------------------------------------
# Each entry is another array reference. The referenced array
# for each entry has two elements:
# $db - the name of the database (ie dictionary)
# $definition - the text of the definition
#---------------------------------------------------------------
#FIXME# ($db, $definition) = $eref[0]->@$entry;
foreach $entry (@$eref)
{
($db, $definition) = @$entry;
$title = $dict->dbTitle($db);
print_xmlout($db, $definition);
last; # Note: we need only one definition! ck
}
}
print_epilog();
}
}
sub print_prolog
{
print "\n";
print "\n"
if $opt_b;
print "\n" if $opt_b;
print " \n";
print " \n";
print " \n";
print " \n";
print "*** Source: $title ***\n" if defined $title;
print " \n";
print " \n";
print " \n";
}
sub print_epilog
{
print " \n";
print "\n" if $opt_b;
}
sub print_xmlout
{
my($db, $data) = @_;
$data = $data."\n";
my $in_glossdef = 0;
my $nextline = "";
### warn $data; return;
$data =~ s/\r\n/\n/gmo;
$data =~ s/\n\n\n/\n.\n/gmo; # mark the end of a foldoc definition! ck
#TBD#
$data =~ s/[\t ]{2+}/ /gmo; # prevent to many spaces in a paragraph! ck
$data =~ s/\n\n/\n \n/gmo; # prevent empty lines! ck
my @data = split(/\n/mo, $data);
@data = reverse @data;
#TBD# foreach my $line (@data)
# Note: an empty line breaks this while loop, but we need it yet! ck
while (my $line = pop @data)
{
if ($in_glossdef and $nextline = pop @data)
{
# read lines while in paragraph
while ($nextline =~ /^\s+\S+/mo) #
{
$line = $line . $nextline;
$nextline = pop @data;
}
# FIXME#this append at the end: push @data, $nextline;
## warn "keep in mind: '$nextline'";
warn "server sends: $line\n" if ($opt_v > 1);
}
{ # xml (docbook) part
#XXX# warn "server sends: $line\n" if ($opt_v > 1);
if ($in_glossdef and $line =~ /^\.\s*$/mo)
{
#
# Note: this match not in a multi line string!
#
warn "this is the END of a foldoc definition:";
#
print " \n";
print " \n";
print " \n";
print "\n";
$in_glossdef = 0;
}
elsif ($line =~ /^\s+$/mo)
{
#
# FIXME Note: this match not in a multi line string!
#
### warn "this is a paragraph of a foldoc definition:";
#
### print " \n";
### print " \n";
### print " \n";
}
elsif ($line =~ /^\w+.*$/mo)
{
#
# this is a new glossterm (a foldoc definition):
#
# NOTE: uc() to support case insensitive sort! ck
my $anchor = uc($line);
# NOTE: prevent non-alphanumeric chars:
$anchor =~ s/[\/_-]+/ /gmo;
$anchor =~ s/[^\w\d_]+/_/gmo;
$anchor =~ s/_[_]+/_/gmo;
print " \n";
print
" $line\n";
print " \n";
print " \n";
$in_glossdef = 1;
}
else
{
#==================================================
# Note: the order is very important! ck
#==================================================
#
# this is a subject classification (from a foldoc definition):
#
## $line =~ s#(.*?)<(.+?)>(.*?)#$1$3#gmo;
#
# quote the xml keys:
#
if (
!( $line =~
s#<\b(\S+@\S+)\b>#$1#gmo
)
)
{
$line =~ s#\\&#gmo;
$line =~ s#<#\<#gmo;
$line =~ s#>#\>#gmo;
$line =~ s#"#\"#gmo;
}
#
# look for a single acronym with uc chars and mark them as glossterm
#
# $line =~ s#(.+?\()(.+?)([,)][^}].*?)#omg$1$2$3#gmo;
# FIXME: dosn't match this! (WWW, W3, The Web) or (URL, previously "Universal")
$line =~
s#(.*?)([(])(\s*?)([A-Z][A-Z0-9]+?)(,\s*|[)])(.*?)#$1$2$3$4$5$6#gmo;
#
# create link reference to other glossterm, but only if they are included in this glossary!
#
## Note: this match only for first cross-references in a line:
while ($line =~ /([^{\\]*?)\{([^)}]+?)\}(.*)/mo)
{
my $prev = $1;
my $linkterm = $2;
my $next = $3;
#TBD#prevent non-alphanumeric chars:
# $linkterm =~ s/[^\w\d_]+/_/gmo;
$linkterm =~ s/[\/_-]+/ /gmo;
$linkterm =~ s/\s[\s]+/ /gmo;
# NOTE: uc() to support case insensitive sort! ck
my $linkend = uc($linkterm);
if ($glossaryterms{$linkend})
{
# NOTE: prevent non-alphanumeric chars:
$linkend =~ s/[^\w\d_]+/_/gmo;
$linkend =~ s/_[_]+/_/gmo;
warn "set link to $linkend" if ($opt_v > 1);
$line =
"$prev$linkterm$next";
}
else
{
warn "### Not found: $linkterm" if $opt_v;;
$line = "$prev$linkterm$next";
}
}
#
# all other glossterm not included in this glossary!
#
{
warn "### The rest: $2" if
$line =~
s#([^{\\]*?)\{([^)}]+?)\}(.*?)#$1$2$3#gmo;
#TBD#
$line =~
s#(\s+)\bAddress:(.+?\.)(\s*?)#$1$2$3#gmo;
$line =~
s#(\s+)\bTelephone:(.+?\.)(\s*?)#$1$2$3#gmo;
#
# foldoc change notice in ISO formart YYYY-MM-DD like: (1996-11-03)
#
$line =~
s#(.*?)(\(\d{4}.\d{2}.\d{2}\))(.*?)#$1$db$2$3#gmo;
}
#
# this is an external reference (from a foldoc definition):
# FIXME: {Home (http://www.ieee.org/)}. {Gopher (gopher://gopher.ieee.org/)}. {(ftp://ftp.ieee.org/)}.
# Home . Gopher (gopher://gopher.ieee.org/)}. {.
# from 'Institute of Electrical and Electronics Engineers, Inc.'
#
$line =~
s#([^{\\]*?)\{([^(]*?)\((http:\S+?)\)\}(.*?)#$1$2$4#gmo;
$line =~
s#([^{\\]*?)\{([^(]*?)\((ftp:\S+?)\)\}(.*?)#$1$2$4#gmo;
$line =~
s#([^{\\]*?)\{([^(]*?)\((gopher:\S+?)\)\}(.*?)#$1$2$4#gmo;
$line =~
s#([^{\\]*?)\{([^(]*?)\((telnet:\S+?)\)\}(.*?)#$1$2$4#gmo;
$line =~
s#([^{\\]*?)\{([^(]*?)(news:\S+?)\}(.*?)#$1$2$4#gmo;
print "$line\n";
} # end xml (docbook) part
}
# FIXME# it seems $nextline is not used if we use forech loop? ck
if ($nextline =~ /^\s+$/mo)
{
#
warn "this is a paragraph of a foldoc definition:" if ($opt_v > 1);
#
print " \n";
print " \n";
print " \n";
}
elsif ($in_glossdef and $nextline =~ /^\.\s*$/mo)
{
#
warn "this is the END of a foldoc definition:" if ($opt_v > 1);
#
print " \n";
print " \n";
print " \n";
print "\n";
$in_glossdef = 0;
}
}
}
main();
######################### end of code part #########################
__END__
=head1 NAME
foldoc2docbook - build a B glossary for items requested from a DICT server
=head1 DESCRIPTION
This is a simple script which illustrates use of the
C perl module to get items from a DICT server.
If no hostname is given, then the default DICT server C is used.
In B mode, each item read from I or I
will be requested from the desired DICT server and the output is
transformed to a B glossary form and printed to I.
The I dictionary is used if available, or the I dictionary.
If neither (of both) is available, all available other dictionaries are used.
Each DICT server typically supports a number of strategies which
can be used to match words against a pattern. We use only the first
I match if found. In case of no match, we try
first I and second I match request.
We create link references to other glossentries, but only if
they are included in this glossary!
In B mode, the user is prompted for items.
We look up the definitions and display all that we get back from the server.
=head1 SYNOPSIS
B
[ B<--host=>I ]
[ B<--port=>I<2628> ]
[ B<--database=>I ]
[ B<--strategy=>I ]
[ B<--man> ]
[ B<--help> ]
[ B<--noxml> ]
[ B<--noCorrect> ]
[ B<--nobook> ]
[ B<--verbose> ]
[ B<--interactive> ]
I
=head2 EXAMPLES
foldoc2docbook.pl -i --noxml --host=dict.org -- -
foldoc2docbook.pl -x glossary.terms > glossary.xml 2> glossary.err
xmllint --valid --noout glossary.xml
xsltproc --nonet --timing -o glossary.html \
/usr/share/sgml/docbook-xsl-1.62.4/html/docbook.xsl glossary.xml
python ./acronymparser.py glossary.xml \
| sort -ubf | perl -pe 's/ +/ /g; s/^ +//; s/ +$//;' > acronymparser.log
=cut
=head1 OPTIONS
=over 4
=item B<-ho> I or B<--host>=I
The hostname for the DICT server.
If not specified the default B is used.
=item B<-p> I or B<--port>=I
Specify the port for connections (default is B<2628>, from C).
=item B<-da> I or B<--database>=I
The name of a specific database (dictionary) to query.
If not specified the default B is used.
=item B<-s> I or B<--strategy>=I
If not specified the default B is used as first search strategy.
=item B<-he> or B<--help>
Display a short help message including command-line options.
=item B<-m> or B<--man>
Display the full documentation for B.
=item B<-b> or B<--nobook>
In B mode (default C), we print the full B book
(B) file with DOCTYPE header. Useful to validate
with B or any other validating parser.
=item B<-C> or B<--noCorrect>
In B mode (default C), only the first strategy is used
=item B<-x> or B<--noxml>
In B mode (default C), each item read from I or I
will be requested from the desired DICT server and the output is
transformed to a B glossary form and printed to I
=item B<-i> or B<--interactive>
In B mode, the user is prompted for items.
We look up the definitions and display all that we get back from server.
=item B<-v> or B<--verbose>
Display verbose information as B runs.
=back
=head1 MATCH STRATEGIES USED
default only the first strategy is used
1.) exact Match words exactly
If B<--Correct> was set and there was no match, than two additionally sreach
strategy are used:
2.) lev Match words within Levenshtein distance one
3.) prefix Match prefixes
B strategies mostly available are:
exact Match words exactly
prefix Match prefixes
substring Match substring occurring anywhere in word
suffix Match suffixes
re POSIX 1003.2 (modern) regular expressions
regexp Old (basic) regular expressions
soundex Match using SOUNDEX algorithm
lev Match words within Levenshtein distance one
=head1 PREREQUISITES
This script uses the C, and C modules. It also requires
C,
C,
C.
=head1 COREQUISITES
B running on DICT server side with an up to date B dictionary.
=head1 OSNAMES
any
=head1 SCRIPT CATEGORIES
Publishing/docbook
Networking
Search
=head1 How to make a foldoc dictionary for your local dictd
Download the B package from http://www.freedict.de and install them
following the README instructions.
Download the current http://foldoc.doc.ic.ac.uk/foldoc/Dictionary.gz file and
B them in C directory.
After C; edit the C file and
insert or check the B entry. Now restart the B server.
=head1 SEE ALSO
Other man pages to check out, like dict(1), dictd(8), perl(1), Net::Dict(1) and pod2html(1)
=over 4
=item www.dict.org
The DICT home page, with all sorts of useful information.
There are a number of other DICT clients available.
http://www.dict.org/links.html
=item www.freedict.de
Many people asked how to generate dictionaries for DICT (and freedict).
- Well, it is not quite that difficult, if one uses the right tools.
http://www.freedict.de/How_make_dictionaries.html
=item www.foldoc.org
Instructions for FOLDOC Guest Editors
http://foldoc.doc.ic.ac.uk/foldoc/editing.html
=item dict
The B client written by the author of B,
Rik Faith (faith@cs.unc.edu).
The options are pretty much lifted from Rik's client.
=item DocBook XSL The Complete Guide
Bob Stayton (Second Edition) as book or online
http://www.sagehill.net/docbookxsl/ToolsPart.html
=item Docbook XSL
DocBook XSL Stylesheet Documentation
http://docbook.sourceforge.net/release/xsl/current/doc/
=item Docbook Book
DocBook The Definitive Guide
http://docbook.org/tdg/en/html/docbook.html
=back
=head1 CAVEATS
The items are sorted automatically, but case sensitive. If the inputfile is
already sorted, then this should be suppressed or changed.
But currently you have to change the script (search for C)!
=head1 KNOWN BUGS AND LIMITATIONS
=over 4
=item *
B doesn't know how to handle firewalls.
=item *
The authentication aspects of RFC 2229 aren't currently supported.
=back
=head1 RESTRICTIONS
Bugs I don't plan to fix :-(
The B (B) output should be valid, but it depends on the xref
entries and the dictionary contents. So you should be willing and able
to solve the ERRORS reported by B within an editor by hand.
=head1 REVISION
@(#) $NAME$ $Id: foldoc2docbook.pl,v 1.2 2003/12/06 17:00:00 klein Exp $
=head1 COPYRIGHT
Copyright 2002, 2003 Claus Klein (claus.klein@arcormail.de)
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 1, or (at your option) any
later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
675 Mass Ave, Cambridge, MA 02139, USA.
=head1 AUTHOR
Claus Klein (claus.klein@arcormail.de)
=head1 HISTORY
You might keep a modification log here.
=head1 TODO
=over 4
=item *
Transform (B) entries correct from B dictionary itmes.
i.e. (WWW, W3, The Web) from C
or (URL, previously "Universal") from C
=item *
I plan to implement most of this options:
/usr/bin/dict: option requires an argument -- h
dict 1.4.9/rf on Linux 2.4.19-4GB
Copyright 1997, 1998 Rickard E. Faith (faith@cs.unc.edu)
-h --host specify server
-p --port specify port
-d --database select a database to search
-m --match match instead of define
-s --strategy strategy for matching or defining
-c --config specify configuration file
-C --nocorrect disable attempted spelling correction
-D --dbs show available databases
-S --strats show available search strategies
-H --serverhelp show server help
-i --info show information about a database
-I --serverinfo show information about the server
-a --noauth disable authentication
-u --user username for authentication
-k --key shared secret for authentication
-V --version display version information
-L --license display copyright and license information
=back
=head1 README
foldoc2docbook - build a B glossary for items requested from a DICT server
This is a simple script which illustrates use of the
C perl module to get items from a DICT server.
In B mode, each items read from I or I
will be requested to the desired DICT server and the output is
transformed to a docbook glossary form and printed to I.
This script is based on:
dict - perl DICT client (for accessing network dictionary servers)
$ID: dict,v 1.2 2002/03/22 17:45:28 neilb Exp $
@(#) $NAME$ $Id: foldoc2docbook.pl,v 1.2 2003/12/06 17:00:00 klein Exp $
Claus Klein (claus.klein@arcormail.de)
=cut