#!/usr/bin/perl # see README below my $VERSION = '$Revision: 1.2 $'; # # Note: use perl always with warnings and strict to prevent a lot of problems! # use warnings; use strict; use Getopt::Long; use Pod::Usage; use Net::Dict; my $host = 'localhost'; my $port = '2628'; my $database = 'foldoc'; my $prompt = "define> "; my $dict; my $eref; my $entry; my $db; my $definition; my %glossaryterms = (); my $title = ''; my $opt_x = 1; # default on, see GetOptions my $opt_d = 0; # test only! #FIXME# my $opt_v = 0; # test only! #FIXME# my $opt_b = 1; # default on, create a valid docbook my $opt_C = 1; # default on (Correct) disable attempted spelling correction # i.e.: use strategy exact only my $strategy = 'exact'; #default for matching or defining my $man = 0; my $help = 0; my $interactive = 0; # # Parse options and print usage if there is a syntax error, # or if usage was explicitly requested. # # Getopt::Long supports two useful variants of simple options: # negatable options and incremental options. # A negatable option is specified with a exclamation mark "!" # after the option name. # GetOptions( 'help' => \$help, 'man' => \$man, 'xml!' => \$opt_x, 'Correct!' => \$opt_C, 'strategy=s' => \$strategy, 'book!' => \$opt_b, 'verbose+' => \$opt_v, 'host=s' => \$host, 'port=s' => \$port, 'database=s' => \$database, 'interactive!' => \$interactive, 'debug!' => \$opt_d, ) or pod2usage(2); pod2usage(1) if $help; pod2usage(-verbose => 2) if $man; ## If no arguments were given, then allow STDIN to be used only ## if it's not connected to a terminal (otherwise print usage) pod2usage("$0: No files given.") if ((@ARGV == 0) && (!$interactive) && (-t STDIN)); #----------------------------------------------------------------------- # Turn off buffering on STDOUT #----------------------------------------------------------------------- $| = 1; #----------------------------------------------------------------------- # Create instance of Net::Dict, connecting either to a user-specified # DICT server, or defaulting to dict.org #----------------------------------------------------------------------- ## $host = @ARGV > 0 ? shift @ARGV : 'dict.org'; warn "Connecting to $host ..." if $opt_v; $dict = Net::Dict->new( $host, Port => $port, Client => "foldoc2docbook $VERSION", ### Timeout => 120, Debug => $opt_d ) or die; warn "\n" if $opt_v; sub main { my $glossterm = ''; #----------------------------------------------------------------------- # This is the "SHOW DATABASES" command from RFC 2229. #----------------------------------------------------------------------- if ($opt_v) { my %dbhash = $dict->dbs(); warn "Available dictionaries:\n"; while (($db, $title) = each %dbhash) { warn "$db : $title\n" if defined $title; } } $title = $dict->dbTitle($database); if (!$title) { $database = 'wn' if $title = $dict->dbTitle('wn'); if (!$title) { $database = '*' if $title = $dict->dbTitle('*'); } } $dict->setDicts($database); #----------------------------------------------------------------------- # This returns a string, containing description of the dictionary $database. #----------------------------------------------------------------------- if ($opt_d and $opt_v) { my $dbInfo = $dict->dbInfo($database); warn "Used dictionary:\n"; warn $dbInfo; # # NOTE: uc() to support case insensitive sort! ck # $glossaryterms{uc('00-database-info')} = [[$database, $dbInfo]] if $opt_x; # # FIXME# after the dbInfo() call, the first define() call returns an empty reference or hang? # $eref = $dict->define($database); } $glossaryterms{uc($database)} = [[$database, "$database\n \{$title\}\n\n"]] # FIXME should end with '\n\n\n' if $opt_x; #----------------------------------------------------------------------- # Let the user repeatedly enter words, which we then look up. #----------------------------------------------------------------------- print $prompt if $interactive; while (<>) { chomp; # avoid \n on last field #TBD#prevent non-alphanumeric chars: # $_ =~ s/[^\w\d_]+/_/gmo; $_ =~ s/[\/_-]+/ /gmo; $_ =~ s/\s[\s]+/ /gmo; if (not $_ or defined $glossaryterms{uc($_)}) { print $prompt if $interactive; next; } #------------------------------------------------------------------- # The define() method returns an array reference. # The array has one entry for each definition found. # If the referenced array has no entries, then there were no # definitions in any of the dictionaries on the server. #------------------------------------------------------------------- ### we need the glossterm $eref = $dict->define($_); ### if (@$eref == 0) { #------------------------------------------------------------------- # The match() method returns an array reference. # You can lookup a list of words which match a given pattern, using # the match() method. Each DICT server typically supports a number of # strategies which can be used to match words against a pattern. #------------------------------------------------------------------- $eref = $dict->match($_, $strategy); if (@$eref == 0 and not $opt_C) { warn " use strategie 'lev' for \"$_\"\n" if $opt_v; $eref = $dict->match($_, 'lev'); if (@$eref == 0) { warn " use strategie 'prefix' for \"$_\"\n" if $opt_v; $eref = $dict->match($_, 'prefix'); } } if (@$eref != 0) { #--------------------------------------------------------------- # Each entry is another array reference. The referenced array # for each entry has two elements: # $db - the name of the database (ie dictionary) # $definition - the definition mached the request #--------------------------------------------------------------- foreach $entry (@$eref) { ($db, $glossterm) = @$entry; $eref = $dict->define($glossterm, $db); warn " found '$glossterm' in \"$db\"\n" if $opt_v; last; # Note: we need only one definition! ck } } } if (@$eref == 0) { warn " no definition for \"$_\"\n"; } else { #--------------------------------------------------------------- # Each entry is another array reference. The referenced array # for each entry has two elements: # $db - the name of the database (ie dictionary) # $definition - the text of the definition #--------------------------------------------------------------- foreach $entry (@$eref) { ($db, $definition) = @$entry; next if not defined $db and $opt_d; if ($opt_d or $interactive) { $title = $dict->dbTitle($db); warn " found '$glossterm' in \"$title\"\n" if (defined $title and $opt_v); warn "\n-------------------------------------------\n", $definition; } if ($opt_x) { # # this is a new glossterm (a foldoc definition): # # NOTE: uc() to support case insensitive sort! ck #TBD#prevent non-alphanumeric chars: # $glossterm =~ s/[^\w\d_]+/_/gmo; $glossterm =~ s/[\/_-]+/ /gmo; $glossterm =~ s/\s[\s]+/ /gmo; $glossaryterms{uc($glossterm)} = $eref; warn "found '$glossterm'" if $opt_v; last; # Note: we need only one definition! ck } } } print $prompt if $interactive; } if ($opt_x) { print_prolog(); # FIXME# should be sorted like sort -ubf ## foreach $glossterm ( sort { uc$a cmp uc$b } keys %glossaryterms) foreach $glossterm (sort keys %glossaryterms) { $eref = $glossaryterms{$glossterm}; #--------------------------------------------------------------- # Each entry is another array reference. The referenced array # for each entry has two elements: # $db - the name of the database (ie dictionary) # $definition - the text of the definition #--------------------------------------------------------------- #FIXME# ($db, $definition) = $eref[0]->@$entry; foreach $entry (@$eref) { ($db, $definition) = @$entry; $title = $dict->dbTitle($db); print_xmlout($db, $definition); last; # Note: we need only one definition! ck } } print_epilog(); } } sub print_prolog { print "\n"; print "\n" if $opt_b; print "\n" if $opt_b; print " \n"; print " \n"; print " \n"; print " \n"; print "*** Source: $title ***\n" if defined $title; print " \n"; print " \n"; print " \n"; } sub print_epilog { print " \n"; print "\n" if $opt_b; } sub print_xmlout { my($db, $data) = @_; $data = $data."\n"; my $in_glossdef = 0; my $nextline = ""; ### warn $data; return; $data =~ s/\r\n/\n/gmo; $data =~ s/\n\n\n/\n.\n/gmo; # mark the end of a foldoc definition! ck #TBD# $data =~ s/[\t ]{2+}/ /gmo; # prevent to many spaces in a paragraph! ck $data =~ s/\n\n/\n \n/gmo; # prevent empty lines! ck my @data = split(/\n/mo, $data); @data = reverse @data; #TBD# foreach my $line (@data) # Note: an empty line breaks this while loop, but we need it yet! ck while (my $line = pop @data) { if ($in_glossdef and $nextline = pop @data) { # read lines while in paragraph while ($nextline =~ /^\s+\S+/mo) # { $line = $line . $nextline; $nextline = pop @data; } # FIXME#this append at the end: push @data, $nextline; ## warn "keep in mind: '$nextline'"; warn "server sends: $line\n" if ($opt_v > 1); } { # xml (docbook) part #XXX# warn "server sends: $line\n" if ($opt_v > 1); if ($in_glossdef and $line =~ /^\.\s*$/mo) { # # Note: this match not in a multi line string! # warn "this is the END of a foldoc definition:"; # print " \n"; print " \n"; print " \n"; print "\n"; $in_glossdef = 0; } elsif ($line =~ /^\s+$/mo) { # # FIXME Note: this match not in a multi line string! # ### warn "this is a paragraph of a foldoc definition:"; # ### print " \n"; ### print " \n"; ### print " \n"; } elsif ($line =~ /^\w+.*$/mo) { # # this is a new glossterm (a foldoc definition): # # NOTE: uc() to support case insensitive sort! ck my $anchor = uc($line); # NOTE: prevent non-alphanumeric chars: $anchor =~ s/[\/_-]+/ /gmo; $anchor =~ s/[^\w\d_]+/_/gmo; $anchor =~ s/_[_]+/_/gmo; print " \n"; print " $line\n"; print " \n"; print " \n"; $in_glossdef = 1; } else { #================================================== # Note: the order is very important! ck #================================================== # # this is a subject classification (from a foldoc definition): # ## $line =~ s#(.*?)<(.+?)>(.*?)#$1$3#gmo; # # quote the xml keys: # if ( !( $line =~ s#<\b(\S+@\S+)\b>#$1#gmo ) ) { $line =~ s#\&#\&#gmo; $line =~ s#<#\<#gmo; $line =~ s#>#\>#gmo; $line =~ s#"#\"#gmo; } # # look for a single acronym with uc chars and mark them as glossterm # # $line =~ s#(.+?\()(.+?)([,)][^}].*?)#omg$1$2$3#gmo; # FIXME: dosn't match this! (WWW, W3, The Web) or (URL, previously "Universal") $line =~ s#(.*?)([(])(\s*?)([A-Z][A-Z0-9]+?)(,\s*|[)])(.*?)#$1$2$3$4$5$6#gmo; # # create link reference to other glossterm, but only if they are included in this glossary! # ## Note: this match only for first cross-references in a line: while ($line =~ /([^{\\]*?)\{([^)}]+?)\}(.*)/mo) { my $prev = $1; my $linkterm = $2; my $next = $3; #TBD#prevent non-alphanumeric chars: # $linkterm =~ s/[^\w\d_]+/_/gmo; $linkterm =~ s/[\/_-]+/ /gmo; $linkterm =~ s/\s[\s]+/ /gmo; # NOTE: uc() to support case insensitive sort! ck my $linkend = uc($linkterm); if ($glossaryterms{$linkend}) { # NOTE: prevent non-alphanumeric chars: $linkend =~ s/[^\w\d_]+/_/gmo; $linkend =~ s/_[_]+/_/gmo; warn "set link to $linkend" if ($opt_v > 1); $line = "$prev$linkterm$next"; } else { warn "### Not found: $linkterm" if $opt_v;; $line = "$prev$linkterm$next"; } } # # all other glossterm not included in this glossary! # { warn "### The rest: $2" if $line =~ s#([^{\\]*?)\{([^)}]+?)\}(.*?)#$1$2$3#gmo; #TBD# $line =~ s#(\s+)\bAddress:(.+?\.)(\s*?)#$1
$2
$3#gmo; $line =~ s#(\s+)\bTelephone:(.+?\.)(\s*?)#$1
$2
$3#gmo; # # foldoc change notice in ISO formart YYYY-MM-DD like: (1996-11-03) # $line =~ s#(.*?)(\(\d{4}.\d{2}.\d{2}\))(.*?)#$1$db$2$3#gmo; } # # this is an external reference (from a foldoc definition): # FIXME: {Home (http://www.ieee.org/)}. {Gopher (gopher://gopher.ieee.org/)}. {(ftp://ftp.ieee.org/)}. # Home . Gopher (gopher://gopher.ieee.org/)}. {. # from 'Institute of Electrical and Electronics Engineers, Inc.' # $line =~ s#([^{\\]*?)\{([^(]*?)\((http:\S+?)\)\}(.*?)#$1$2$4#gmo; $line =~ s#([^{\\]*?)\{([^(]*?)\((ftp:\S+?)\)\}(.*?)#$1$2$4#gmo; $line =~ s#([^{\\]*?)\{([^(]*?)\((gopher:\S+?)\)\}(.*?)#$1$2$4#gmo; $line =~ s#([^{\\]*?)\{([^(]*?)\((telnet:\S+?)\)\}(.*?)#$1$2$4#gmo; $line =~ s#([^{\\]*?)\{([^(]*?)(news:\S+?)\}(.*?)#$1$2$4#gmo; print "$line\n"; } # end xml (docbook) part } # FIXME# it seems $nextline is not used if we use forech loop? ck if ($nextline =~ /^\s+$/mo) { # warn "this is a paragraph of a foldoc definition:" if ($opt_v > 1); # print "
\n"; print " \n"; print " \n"; } elsif ($in_glossdef and $nextline =~ /^\.\s*$/mo) { # warn "this is the END of a foldoc definition:" if ($opt_v > 1); # print " \n"; print "
\n"; print "
\n"; print "\n"; $in_glossdef = 0; } } } main(); ######################### end of code part ######################### __END__ =head1 NAME foldoc2docbook - build a B glossary for items requested from a DICT server =head1 DESCRIPTION This is a simple script which illustrates use of the C perl module to get items from a DICT server. If no hostname is given, then the default DICT server C is used. In B mode, each item read from I or I will be requested from the desired DICT server and the output is transformed to a B glossary form and printed to I. The I dictionary is used if available, or the I dictionary. If neither (of both) is available, all available other dictionaries are used. Each DICT server typically supports a number of strategies which can be used to match words against a pattern. We use only the first I match if found. In case of no match, we try first I and second I match request. We create link references to other glossentries, but only if they are included in this glossary! In B mode, the user is prompted for items. We look up the definitions and display all that we get back from the server. =head1 SYNOPSIS B [ B<--host=>I ] [ B<--port=>I<2628> ] [ B<--database=>I ] [ B<--strategy=>I ] [ B<--man> ] [ B<--help> ] [ B<--noxml> ] [ B<--noCorrect> ] [ B<--nobook> ] [ B<--verbose> ] [ B<--interactive> ] I =head2 EXAMPLES foldoc2docbook.pl -i --noxml --host=dict.org -- - foldoc2docbook.pl -x glossary.terms > glossary.xml 2> glossary.err xmllint --valid --noout glossary.xml xsltproc --nonet --timing -o glossary.html \ /usr/share/sgml/docbook-xsl-1.62.4/html/docbook.xsl glossary.xml python ./acronymparser.py glossary.xml \ | sort -ubf | perl -pe 's/ +/ /g; s/^ +//; s/ +$//;' > acronymparser.log =cut =head1 OPTIONS =over 4 =item B<-ho> I or B<--host>=I The hostname for the DICT server. If not specified the default B is used. =item B<-p> I or B<--port>=I Specify the port for connections (default is B<2628>, from C). =item B<-da> I or B<--database>=I The name of a specific database (dictionary) to query. If not specified the default B is used. =item B<-s> I or B<--strategy>=I If not specified the default B is used as first search strategy. =item B<-he> or B<--help> Display a short help message including command-line options. =item B<-m> or B<--man> Display the full documentation for B. =item B<-b> or B<--nobook> In B mode (default C), we print the full B book (B) file with DOCTYPE header. Useful to validate with B or any other validating parser. =item B<-C> or B<--noCorrect> In B mode (default C), only the first strategy is used =item B<-x> or B<--noxml> In B mode (default C), each item read from I or I will be requested from the desired DICT server and the output is transformed to a B glossary form and printed to I =item B<-i> or B<--interactive> In B mode, the user is prompted for items. We look up the definitions and display all that we get back from server. =item B<-v> or B<--verbose> Display verbose information as B runs. =back =head1 MATCH STRATEGIES USED default only the first strategy is used 1.) exact Match words exactly If B<--Correct> was set and there was no match, than two additionally sreach strategy are used: 2.) lev Match words within Levenshtein distance one 3.) prefix Match prefixes B strategies mostly available are: exact Match words exactly prefix Match prefixes substring Match substring occurring anywhere in word suffix Match suffixes re POSIX 1003.2 (modern) regular expressions regexp Old (basic) regular expressions soundex Match using SOUNDEX algorithm lev Match words within Levenshtein distance one =head1 PREREQUISITES This script uses the C, and C modules. It also requires C, C, C. =head1 COREQUISITES B running on DICT server side with an up to date B dictionary. =head1 OSNAMES any =head1 SCRIPT CATEGORIES Publishing/docbook Networking Search =head1 How to make a foldoc dictionary for your local dictd Download the B package from http://www.freedict.de and install them following the README instructions. Download the current http://foldoc.doc.ic.ac.uk/foldoc/Dictionary.gz file and B them in C directory. After C; edit the C file and insert or check the B entry. Now restart the B server. =head1 SEE ALSO Other man pages to check out, like dict(1), dictd(8), perl(1), Net::Dict(1) and pod2html(1) =over 4 =item www.dict.org The DICT home page, with all sorts of useful information. There are a number of other DICT clients available. http://www.dict.org/links.html =item www.freedict.de Many people asked how to generate dictionaries for DICT (and freedict). - Well, it is not quite that difficult, if one uses the right tools. http://www.freedict.de/How_make_dictionaries.html =item www.foldoc.org Instructions for FOLDOC Guest Editors http://foldoc.doc.ic.ac.uk/foldoc/editing.html =item dict The B client written by the author of B, Rik Faith (faith@cs.unc.edu). The options are pretty much lifted from Rik's client. =item DocBook XSL The Complete Guide Bob Stayton (Second Edition) as book or online http://www.sagehill.net/docbookxsl/ToolsPart.html =item Docbook XSL DocBook XSL Stylesheet Documentation http://docbook.sourceforge.net/release/xsl/current/doc/ =item Docbook Book DocBook The Definitive Guide http://docbook.org/tdg/en/html/docbook.html =back =head1 CAVEATS The items are sorted automatically, but case sensitive. If the inputfile is already sorted, then this should be suppressed or changed. But currently you have to change the script (search for C)! =head1 KNOWN BUGS AND LIMITATIONS =over 4 =item * B doesn't know how to handle firewalls. =item * The authentication aspects of RFC 2229 aren't currently supported. =back =head1 RESTRICTIONS Bugs I don't plan to fix :-( The B (B) output should be valid, but it depends on the xref entries and the dictionary contents. So you should be willing and able to solve the ERRORS reported by B within an editor by hand. =head1 REVISION @(#) $NAME$ $Id: foldoc2docbook.pl,v 1.2 2003/12/06 17:00:00 klein Exp $ =head1 COPYRIGHT Copyright 2002, 2003 Claus Klein (claus.klein@arcormail.de) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. =head1 AUTHOR Claus Klein (claus.klein@arcormail.de) =head1 HISTORY You might keep a modification log here. =head1 TODO =over 4 =item * Transform (B) entries correct from B dictionary itmes. i.e. (WWW, W3, The Web) from C or (URL, previously "Universal") from C =item * I plan to implement most of this options: /usr/bin/dict: option requires an argument -- h dict 1.4.9/rf on Linux 2.4.19-4GB Copyright 1997, 1998 Rickard E. Faith (faith@cs.unc.edu) -h --host specify server -p --port specify port -d --database select a database to search -m --match match instead of define -s --strategy strategy for matching or defining -c --config specify configuration file -C --nocorrect disable attempted spelling correction -D --dbs show available databases -S --strats show available search strategies -H --serverhelp show server help -i --info show information about a database -I --serverinfo show information about the server -a --noauth disable authentication -u --user username for authentication -k --key shared secret for authentication -V --version display version information -L --license display copyright and license information =back =head1 README foldoc2docbook - build a B glossary for items requested from a DICT server This is a simple script which illustrates use of the C perl module to get items from a DICT server. In B mode, each items read from I or I will be requested to the desired DICT server and the output is transformed to a docbook glossary form and printed to I. This script is based on: dict - perl DICT client (for accessing network dictionary servers) $ID: dict,v 1.2 2002/03/22 17:45:28 neilb Exp $ @(#) $NAME$ $Id: foldoc2docbook.pl,v 1.2 2003/12/06 17:00:00 klein Exp $ Claus Klein (claus.klein@arcormail.de) =cut