#!/usr/bin/perl # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # If you use it/like it, send a postcard to the author. # Find the latest version in CPAN use Cwd qw(abs_path); use Getopt::Long; # use LWP::UserAgent; # use HTTP::Cookies; use URI::URL; use URI::Heuristic qw(uf_urlstr); use URI::Escape qw(uri_escape); use LWP::MediaTypes qw(media_suffix); # use HTTP::Status; use HTTP::Daemon; # use IO::Socket qw(getpeername); use CGI qw/escape unescape/; #if ($^O =~ /win32/i) { # eval { require Win32::Internet; }; #} use strict; #!/usr/bin/perl # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # If you use it/like it, send a postcard to the author. # Find the latest version in CPAN use LWP::UserAgent; use HTTP::Cookies; if ($^O =~ /win32/i) { eval { require Win32::Internet; }; } use strict; # ftp.pm -- modified from Gisle Aas' "LWP::Protocol::ftp" # by Flavio S. Glock # # oct-13-2000: Modified to include "REST" support # $Id: ftp.pm,v 1.27 1999/11/04 20:25:51 gisle Exp $ # Implementation of the ftp protocol (RFC 959). We let the Net::FTP # package do all the dirty work. package ftp; use Carp (); use HTTP::Status (); use HTTP::Negotiate (); use HTTP::Response (); use LWP::MediaTypes (); use File::Listing (); use Net::Cmd qw(CMD_MORE); require LWP::Protocol; our @ISA = qw(LWP::Protocol); use strict; eval { require Net::FTP; Net::FTP->require_version(2.00); }; my $init_failed = $@; my $DEBUG = 0; sub request { # arg is the receive-data callback subroutine my($request, $timeout, $arg) = @_; print " [ ftp::request BEGIN ]\n" if $DEBUG; if ($init_failed) { print " [ ftp::request DONE 2 ]\n" if $DEBUG; return HTTP::Response->new(&HTTP::Status::RC_INTERNAL_SERVER_ERROR, $init_failed); } my $size = 65536; my $method = 'GET'; my ( $url, $scheme, $host, $port, $user, $password ); eval { $url = $request->url; $scheme = $url->scheme; $host = $url->host; $port = $url->port; $user = $url->user; $password = $url->password; }; # If a basic autorization header is present than we prefer these over # the username/password specified in the URL. my($u,$p) = $request->authorization_basic; if (defined $u) { $user = $u; $password = $p; } # We allow the account to be specified in the "Account" header my $acct = $request->header('Account'); # try to make a connection my $ftp = Net::FTP->new($host, Port => $port); unless ($ftp) { $@ =~ s/^Net::FTP: //; print " [ ftp::request DONE 3 ]\n" if $DEBUG; return HTTP::Response->new(&HTTP::Status::RC_INTERNAL_SERVER_ERROR, $@); } # Create an initial response object my $response = HTTP::Response->new(&HTTP::Status::RC_OK, "Document follows"); $response->request($request); my $mess = $ftp->message; # welcome message $mess =~ s|\n.*||s; # only first line left $mess =~ s|\s*ready\.?$||; # Make the version number more HTTP like $mess =~ s|\s*\(Version\s*|/| and $mess =~ s|\)$||; $response->header("Server", $mess); $ftp->timeout($timeout) if $timeout; print " [ ftp::request Logging in as $user (password $password)... ]\n" if $DEBUG; unless ($ftp->login($user, $password, $acct)) { # Unauthorized. Let's fake a RC_UNAUTHORIZED response my $res = HTTP::Response->new(&HTTP::Status::RC_UNAUTHORIZED, scalar($ftp->message)); $res->header("WWW-Authenticate", qq(Basic Realm="FTP login")); print " [ ftp::request DONE 4 ]\n" if $DEBUG; return $res; } # Get & fix the path my @path = grep { length } $url->path_segments; my $remote_file = pop(@path); $remote_file = '' unless defined $remote_file; $ftp->binary; for (@path) { unless ($ftp->cwd($_)) { print " [ ftp::request DONE 5 ]\n" if $DEBUG; return HTTP::Response->new(&HTTP::Status::RC_NOT_FOUND, "Can't chdir to $_"); } } unless ($method eq 'GET' || $method eq 'HEAD') { print " [ ftp::request DONE 6 ]\n" if $DEBUG; return HTTP::Response->new(&HTTP::Status::RC_BAD_REQUEST, "Illegal method $method"); } if (my $mod_time = $ftp->mdtm($remote_file)) { $response->last_modified($mod_time); if (my $ims = $request->if_modified_since) { if ($mod_time <= $ims) { $response->code(&HTTP::Status::RC_NOT_MODIFIED); $response->message("Not modified"); print " [ ftp::request DONE 7 ]\n" if $DEBUG; return $response; } } } my $data; # the ftp data handle my $content; # Range: bytes=9500- my $range = $request->header("Range"); # request my ($content_begin) = $range =~ /bytes\s?\=\s?(\d+)\s?\-/; print " [ ftp::request remote_file: $remote_file ", length($remote_file), "]\n" if $DEBUG; print " [ ftp::request header: ", $request->as_string, " ]\n" if $DEBUG; print " [ ftp::request range: $range => $content_begin ]\n" if $DEBUG; # print " [ ftp::response header: ", $response->as_string, " ]\n" if $DEBUG; # my $content_range = $request->header("Content-Range"); # response # my $ok = $ftp->quot("REST $content_begin"); my $rest_ok = 0; unless ($ftp->_REST($content_begin)) { print " [ ftp::request rest: error ]\n" if $DEBUG; $ftp->_REST(0); # cancel last _REST $content_begin = 0; } else { $rest_ok = 1; print " [ ftp::request rest: ok ]\n" if $DEBUG; } if (length($remote_file) and $data = $ftp->retr($remote_file)) { print " [ ftp::request remote_file: $remote_file ]\n" if $DEBUG; my($type, @enc) = LWP::MediaTypes::guess_media_type($remote_file); $response->header('Content-Type', $type) if $type; for (@enc) { $response->push_header('Content-Encoding', $_); } my $mess = $ftp->message; my $content_length = 0; print " [ ftp::request mess: $mess $type ]\n" if $DEBUG; if ($mess =~ /\((\d+)\s+bytes\)/) { $content_length = $1; $response->header('Content-Length', $content_length); } if ($method ne 'HEAD') { # Read data from server into callback do { my ($size_read, $data_end); eval { $size_read = $data->read($content, $size); }; print " [ ftp::request Data: $size_read ]\n" if $DEBUG; # print " [ ftp::request Content: ",length($content)," ]\n" if $DEBUG; # print " [ ftp::request Size: $size ]\n" if $DEBUG; if (! $size_read) { # possibly a timeout $@ = 'No data'; print " [ ftp::request No data ]\n" if $DEBUG; $response->code(&HTTP::Status::RC_INTERNAL_SERVER_ERROR); $response->header('X-Died' => $@); $response->message("FTP close response: " . $ftp->code . " " . $ftp->message); $data->close; return $response; } if ($rest_ok and ! $@) { $data_end = $content_begin + $size_read; $response->header("Content-Range", "bytes ${content_begin}-${data_end}/$content_length"); $response->code(&HTTP::Status::RC_PARTIAL_CONTENT); print " [ ftp::request Content-Range: ", $response->header("Content-Range"), " ]\n" if $DEBUG; $content_begin = $data_end; } eval { &$arg($content, $response, undef ); } unless $@; if ($@) { chomp($@); $response->header('X-Died' => $@); last; } } while $content; } # if ne HEAD unless ($data->close) { # Something did not work too well if ($method ne 'HEAD') { $response->code(&HTTP::Status::RC_INTERNAL_SERVER_ERROR); $response->message("FTP close response: " . $ftp->code . " " . $ftp->message); } } } elsif (!length($remote_file) || $ftp->code == 550) { print " [ ftp::request remote_file: (none) ]\n" if $DEBUG; # 550 not a plain file, try to list instead if (length($remote_file) && !$ftp->cwd($remote_file)) { print " [ chdir before listing failed ]\n" if $DEBUG; print " [ ftp::request DONE 8 ]\n" if $DEBUG; return HTTP::Response->new(&HTTP::Status::RC_NOT_FOUND, "File '$remote_file' not found"); } # It should now be safe to try to list the directory my @lsl = $ftp->dir; # Try to figure out if the user want us to convert the # directory listing to HTML. my @variants = ( ['html', 0.60, 'text/html' ], ['dir', 1.00, 'text/ftp-dir-listing' ] ); #$HTTP::Negotiate::DEBUG=1; my $prefer = HTTP::Negotiate::choose(\@variants, $request); my $content = ''; if (!defined($prefer)) { print " [ ftp::request DONE 9 ]\n" if $DEBUG; return HTTP::Response->new(&HTTP::Status::RC_NOT_ACCEPTABLE, "Neither HTML nor directory listing wanted"); } elsif ($prefer eq 'html') { $response->header('Content-Type' => 'text/html'); $response->header('Content-Location' => "$url/") unless $url =~ /\/$/; $content = "File Listing\n"; $content .= "\n\n"; $content .= "

Directory listing of $url

\n"; $content .= "
Up to higher level directory\n";
                    # my $base = $request->url->clone;
                    # my $path = $base->epath;
                    # $base->epath("$path/") unless $path =~ m|/$|;
                    # $content .= qq(\n\n);

                    for (File::Listing::parse_dir(\@lsl, 'GMT')) {
                            my($name, $type, $size, $mtime, $mode) = @$_;
                            $content .= qq(  
  • $name); # $type, $size, $mtime, $mode); $content .= " $size bytes" if $type eq 'f'; $content .= " => $1" if $type =~ /l\s*(.*)/; $content .= "\n"; # \n or PRE } $content .= "
  • \n"; } else { $response->header('Content-Type', 'text/ftp-dir-listing'); $content = join("\n", @lsl, ''); } $response->header('Content-Length', length($content)); if ($method ne 'HEAD') { # $response = LWP::Protocol::collect_once($arg, $response, $content); eval { &$arg($content, $response, undef ); }; # send content eval { &$arg(undef, $response, undef ); }; # finish } } else { print " [ ftp::request Returning message instead of file. data=[$data] ]\n" if $DEBUG; my $res = HTTP::Response->new(&HTTP::Status::RC_BAD_REQUEST, "FTP return code " . $ftp->code); $res->content_type("text/plain"); $res->content($ftp->message); print " [ ftp::request DONE 10 ]\n" if $DEBUG; return $res; } print " [ ftp::request DONE 1 ]\n" if $DEBUG; return $response; } # "pod" removed. See LWP::Protocol::ftp #!/usr/bin/perl # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # If you use it/like it, send a postcard to the author. # Find the latest version in CPAN use strict; package glynx_file; our $VERSION = "1.033"; # fixed "POST" contents our $verbose; our $quiet; our $NAME_TRANSLATION_FILE = "_NAMES_.HTM"; our $TMP_SUFFIX = "._TMP_"; our $INDEXFILE = "_INDEX_.HTM"; our $PART_SUFFIX = "._PART_"; sub my_link { # note: link will COPY files on Windows NT; fatal error on win98 my ($source, $dest) = @_; return if $source eq $dest; unless (-e $source) { print " [ LINK: CAN'T FIND $source ]\n" unless $quiet; return; } if (-d $source) { print " [ LINK: CAN'T LINK FROM DIRECTORY ]\n" unless $quiet; return; } if (-e $dest) { print " [ LINK: ALREADY EXISTS: $dest ]\n" unless $quiet; return; } print " [ LINK: $source to $dest ]\n" if $verbose; # link ($source, $dest); &my_copy ($source, $dest); } sub my_unlink { my ($source) = @_; if (-d $source) { print " [ ERR: WILL NOT UNLINK DIRECTORY ]\n"; return; } if (-e $source) { unlink $source or print " [ ERR: UNLINK $source - $^E ]\n"; } } sub my_touch { my ($source) = @_; my ($now); return if ! $source; print " [ TOUCH: $source ]\n" if $verbose; $now = time; utime $now, $now, $source; } sub my_create_empty { my ($source) = @_; print " [ CREATE-EMPTY: $source ]\n" if $verbose; open (FILE, ">>$source"); binmode FILE; print FILE ""; close (FILE); } sub my_copy { my ($source, $dest) = @_; return if $source eq $dest; unless (-e $source) { print " [ COPY: CAN'T FIND $source ]\n"; return; } if (-d $source) { print " [ COPY: CAN'T COPY DIRECTORY ]\n"; return; } &my_unlink ($dest); print " [ COPY: $source, $dest ]\n" if $verbose; open (FILE1, $source) or print " [ ERR: CAN'T READ $source - $^E ]\n"; open (FILE2, ">$dest") or print " [ ERR: CAN'T CREATE $dest - $^E ]\n"; binmode FILE1; binmode FILE2; local($\) = ""; # ensure standard $OUTPUT_RECORD_SEPARATOR while () { print FILE2 $_; } close (FILE2); close (FILE1); # (adapted from: UserAgent.pm) if (my $lm = (stat($source))[9] ) { # make sure the file has the same last modification time utime $lm, $lm, $dest; } } sub my_rename { my ($source, $dest) = @_; return if $source eq $dest; unless (-e $source) { # print " [ RENAME: CAN'T FIND $source ]\n" if $verbose; return; } &my_unlink ($dest); unless (rename $source, $dest) { # print " [ RENAME: CAN'T RENAME $source $dest - $^E ]\n"; print " [ RENAME: $source, $dest ]\n" if $verbose; &my_copy ($source, $dest); &my_unlink ($source); return; } } sub make_dir { # o parametro para make_dir deve incluir a base my ($name) = @_; return if (-d $name); my (@a, $a, $b, $temp, $dest); # cria o diretorio @a = split('/', $name); push @a, '' if $name =~ /\/$/; $a = ''; foreach(0 .. $#a - 1) { $a .= $a[$_] . '/'; } if (-d $a) { print " [ DIR: $a ok ]\n" if $verbose; return; } $b = $a; $b =~ s/\/$//; if (-e $b) { print " [ MAKE-DIR: $a is a file ]\n" if $verbose; $temp = $b . $TMP_SUFFIX; print " [ MAKE-DIR: MOVE: $b => $temp ]\n" if $verbose; &my_rename ($b, $temp); mkdir $a, 0777; $dest = $b . '/' . $INDEXFILE; print " [ MAKE-DIR: MOVE: $temp => $dest ]\n" if $verbose; &my_rename ($temp, $dest); } $a = ''; foreach(0 .. $#a - 1) { $a .= $a[$_] . '/'; if (-d $a) { # print " [ DIR: $a ok ]\n" if $verbose; } else { print " [ MAKE-DIR: $a ]\n" if $verbose; mkdir $a, 0777; } } } sub modify_file_attrib { my ($name,$attrib,$value) = @_; my ($filename, $parent, @a, $trans_filename); $name =~ s/${PART_SUFFIX}$//; # remove suffix on log ($parent, $filename) = $name =~ /^(.*)\/(.*?)$/; print " [ MODIFY-FILE-ATTRIB: $parent -> $filename ATTRIB: $attrib: $value ]\n" if $verbose; &create_translation_file($parent) unless -e $trans_filename; $trans_filename = "$parent/$NAME_TRANSLATION_FILE"; open (TRFILE, $trans_filename); @a = ; close (TRFILE); foreach( grep { />\Q${filename}\E/ } @a ) { return if (/ $attrib: $value /); s/\s?
    $/ $attrib: $value
    /; print " [ MODIFY-FILE-ATTRIB: $_ ]\n" if $verbose; goto SAVE_MODIFY; } push @a, "$filename $attrib: $value
    \n"; SAVE_MODIFY: &make_dir($trans_filename); open (TRFILE, ">$trans_filename") or print " [ ERR: WRITING $trans_filename - $^E ]\n"; print TRFILE join('', @a); close (TRFILE); print " [ MODIFY-FILE-ATTRIB: NEW: $a[-1] ]\n" if $verbose; } sub get_file_attrib { my ($name,$attrib) = @_; my $value = ''; my ($filename, $parent, @a, $trans_filename); $name =~ s/${PART_SUFFIX}$//; # remove suffix on log ($parent, $filename) = $name =~ /^(.*)\/(.*?)$/; print " [ GET-FILE-ATTRIB: $parent -> $filename ATTRIB: $attrib: $value ]\n" if $verbose; $trans_filename = "$parent/$NAME_TRANSLATION_FILE"; open (TRFILE, $trans_filename); @a = ; close (TRFILE); foreach( grep { />\Q${filename}\E/ } @a ) { return $1 if (/ $attrib: (\S*) /); } return ''; } sub create_translation_file { my ($parent) = @_; my ($trans_filename); my ($content); my ($new_parent); $trans_filename = "$parent/$NAME_TRANSLATION_FILE"; return if -e $trans_filename; &make_dir($trans_filename); $new_parent = $parent . '/'; $new_parent =~ s|\/\/|\/|g; $content = "File Listing\n"; $content .= "\n\n"; $content .= "

    Directory listing of $new_parent

    \n"; $content .= "Up to higher level directory
    \n"; # $content .= "
    ";
            open (TRFILE, ">$trans_filename") or print "  [ ERR: WRITING $trans_filename - $^E ]\n"; 
                print TRFILE $content;
            close (TRFILE);
        }
    
        #
        # returns a shorter name if there is one
        #
        sub check_translation_file {
            my ($filename, $parent) = @_;
            my ($trans_filename, @a, $tr_str, $new_name);
            # do we have a $NAME_TRANSLATION_FILE ?
            $trans_filename = "$parent/$NAME_TRANSLATION_FILE";
            if (-s $trans_filename) {
                open (TRFILE, $trans_filename); 
                    @a = ; 
                close (TRFILE);
                ($tr_str) = grep { />\Q${filename}\E$filename
    \n" ($new_name) = $tr_str =~ /=(.*?)>/; # print " [ SHORTER-NAME: FOUND: $tr_str => $new_name ]\n" if $VERBOSE; return $new_name; } } return ''; } # # returns a bigger name if there is one # sub check_translation_url_node { my ($filename, $parent) = @_; my ($tr_str, $trans_filename, @a, $new_name); # do we have a $NAME_TRANSLATION_FILE ? $trans_filename = "$parent/$NAME_TRANSLATION_FILE"; if (-s $trans_filename) { open (TRFILE, $trans_filename); @a = ; close (TRFILE); ($tr_str) = grep { /=\Q${filename}\E>/ } @a; if ($tr_str) { # "$filename
    \n" ($new_name) = $tr_str =~ />(.*?) $new_name ]\n" if $VERBOSE; return $new_name; } } return ''; } # # stores a new shorter name # sub log_translation_file { my ($filename, $new_name, $parent) = @_; my ($trans_filename); $trans_filename = "$parent/$NAME_TRANSLATION_FILE"; &create_translation_file($parent) unless -e $trans_filename; open (TRFILE, ">>$trans_filename") or print " [ ERR: WRITING $trans_filename - $^E ]\n"; print TRFILE "$filename
    \n"; close (TRFILE); # print " [ SHORTER-NAME: LOGGED: $new_name as $filename at $trans_filename ]\n" if $VERBOSE; return; } 1; package glynx; #require Exporter; #@ISA = qw(Exporter); #@EXPORT = qw(download); our $VERSION = "1.033"; # parameters to "download" hash our $filename; our $content_length; our $url; our $referer = "."; our $agent = "Mozilla/3.0 (WinNT; I)"; our $timeout = 30; our $cookie_file; our $verbose; our $quiet; our $auth = ''; our $post_separator = "_X_POST_X_"; our $mtime; our $PART_SUFFIX = "._PART_"; our $real_name; our $NAME_TRANSLATION_FILE = $glynx_file::NAME_TRANSLATION_FILE; our $TMP_SUFFIX = $glynx_file::TMP_SUFFIX; our $INDEXFILE = $glynx_file::INDEXFILE; # general control our $ua; our $Boundary; our $num_callback; our $file_position; # Defaults our $DEFAULT_PART_SIZE = 4096 * 4; sub select_best_sample { my ($part_filename) = @_; my ($filename); $filename = $part_filename; $filename =~ s/${PART_SUFFIX}$//; # remove suffix $part_filename = $filename . $PART_SUFFIX; my $msg = " [ SELECT-SAMPLE: ERROR $filename"; # escolhe a melhor tentativa if (-s "$filename" > 0) { print " [ SELECT-SAMPLE: EXISTS: $filename ]\n" if $verbose; # ja existe o arquivo pronto - apaga os outros &glynx_file::my_unlink ("$part_filename"); &glynx_file::my_unlink ("${part_filename}-1"); return; } if (! (-e "${part_filename}-1")) { print " [ SELECT-SAMPLE: KEEP: ${part_filename}-1 ]\n" if $verbose; # nao existe outra alternativa return; } if (! (-e "$part_filename")) { print " [ SELECT-SAMPLE: KEEP: $part_filename ]\n" if $verbose; # nao existe outra alternativa &glynx_file::my_rename ("${part_filename}-1", "$part_filename"); return; } # existem ...suffix e ...suffix-1 -- deve escolher o maior if (+(-s "$part_filename") > +(-s "${part_filename}-1")) { print " [ SELECT-SAMPLE: BIGGER: $part_filename ]\n" if $verbose; &glynx_file::my_unlink ("${part_filename}-1"); return; } # ...suffix-1 is bigger -- delete ...suffix and rename ...suffix-1 print " [ SELECT-SAMPLE: BIGGER: ${part_filename}-1 ]\n" if $verbose; &glynx_file::my_rename ("${part_filename}-1", "$part_filename"); } sub get_random { my $a; $a = int(rand(1000)); return substr('000' . $a, -3,3); } sub download_callback { my($data, $response, $protocol) = @_; my ($content_begin, $content_length, $Data_header, $data1); # "$filename", "$real_name", "$num_callback", "$file_position" are global # my ($real_filename); $num_callback++; # The callback function is called with 3 arguments: the data received this time, a # reference to the response object and a reference to the protocol object. # testa se a resposta e' do tipo 206 Partial Content # Content-Length: 10000 # Content-Range: bytes 10329-20328/20329 # print " [ RANGE: RESPONSE->HEADER = ", $response->header, " ] \n" if $verbose; print "." if $verbose; # print " [ DATA = ", escape($data), " ] \n" if $verbose; if ($num_callback == 1) { $file_position = 0; if ($response->code == 200) { ($content_length) = $response->header("Content-Length") =~ /(\d+)/; print " [ content_length = $content_length ] \n" if $verbose; &glynx_file::modify_file_attrib($filename, 'Content-Length', $content_length) if $content_length; } if ($response->code == 206) { if ($response->header("Content-Range") =~ /bytes\s+(\d+)-/) { $content_begin = $1; $file_position = $1; } ($content_length) = $response->header("Content-Range") =~ /\/(\d+)/; print " [ content_length = $content_length ] \n" if $verbose; &glynx_file::modify_file_attrib($filename, 'Content-Length', $content_length) if $content_length; #print " [ BEGIN = ", $content_begin, " ] \n"; if (-s "$filename" != $content_begin) { if ($response->header("Content-Type") =~ /multipart\/x-byteranges;\s*boundary=(.*)$/) { $Boundary = $1; print " [ MULTIPART: BOUNDARY = $Boundary ] \n" if $verbose; # ($Data_header, $data1) = $data =~ /\015\012--${Boundary}\015\012(.*?)\015\012\015\012(.*)$/s; ($data1) = $data =~ /\015\012--${Boundary}\015\012(.*)/s; # ($Data_header, $data1) = split("\015\012\015\012", $data1, 2); ($Data_header, $data) = $data1 =~ /^(.*?)\015\012\015\012(.*)$/s; print " [ MULTIPART: DATA-HEADER = $Data_header ] \n" if $verbose; foreach( split("\015\012", $Data_header)) { my ($header, $content) = split (': ', $_, 2); $response->header($header => $content); } ($content_length) = $response->header("Content-Range") =~ /\/(\d+)/; print " [ content_length = $content_length ] \n" if $verbose; &glynx_file::modify_file_attrib($filename, 'Content-Length', $content_length) if $content_length; print " [ MULTIPART: DATA: ", length($data), " BYTES ]\n" if $verbose; # $data = $data1; ($content_begin) = $response->header("Content-Range") =~ /bytes\s+(\d+)-/; if (-s "$filename" != $content_begin) { $response->code(500); die "Wrong range: multipart"; } $file_position = $content_begin; } else { $response->code(500); die "Wrong range"; } } } else { # Nao aceita resume #die "Nao aceita resume"; # circula os arquivos de tentativas - depois deve escolher a melhor &select_best_sample($filename); &glynx_file::my_rename ("$filename", "${filename}-1"); # normal download to file print " [ NO-RESUME: Novo request ]\n" if $verbose; &glynx_file::my_create_empty ("$filename"); $file_position = 0; } } $data =~ s/\015\012--${Boundary}\015\012(.*)//s if $Boundary; # end of multipart print " [ CALLBACK: WRITE $file_position, ", length($data), " BYTES ]\n" if $verbose; if ($file_position != (-s $filename)) { $response->code(500); die "Wrong file size"; } # print " [ random: ", get_random, " ", get_random, " ]\n" if $verbose; print " [ CALLBACK: file_size: ",-s FILE," ]\n" if $verbose; print " [ CALLBACK: content_begin: $content_begin ]\n" if $verbose; my $tmp_filename = "${filename}-" . get_random; &glynx_file::my_rename ("$filename", "$tmp_filename"); open(FILE, ">>$tmp_filename") or die "Cannot write to $tmp_filename"; binmode(FILE); local($\) = ""; # ensure standard $OUTPUT_RECORD_SEPARATOR # seek (FILE, $file_position, 0); print FILE $data; close(FILE); &glynx_file::my_rename ("$tmp_filename", "$filename"); $file_position += length($data); print " [ CALLBACK = ", $num_callback, " ] \n" if $verbose; #print " [ RESPONSE->CODE = ", $response->code, " ] \n" if $verbose; print " [ RESPONSE->Content-Range = ", $response->header("Content-Range"), " ] \n" if $verbose; } sub timeout { $ua->timeout($_[0]); } sub download { my ($res, $file_size, $INET, $data); my ($content, $content_begin, $content_end, $content_range, $content_size, $content_difference); # $content_length = &glynx_file::get_file_attrib($filename, 'Content-Length'); my %cnf = @_; for (keys %cnf) { eval ("\$" . $_ . " = '" . $cnf{$_} . "'"); } if ($ua) { $ua->cookie_jar->load($cookie_file) if $cookie_file; } else { $ua = LWP::UserAgent->new; $ua->agent($agent); $ua->timeout($timeout); $ua->env_proxy(); $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1, ignore_discard => 1)) if $cookie_file; } $real_name = $filename unless ($real_name); # GET: print " [ GET: $url ]\n" unless $quiet; my $req = HTTP::Request->new(GET => $url); $req->referer($referer . ''); # declare preference for "html" directory listings, if "ftp" $req->header('Accept' => 'text/html;q=1.0,*/*;q=0.6'); $req->authorization_basic(split (/:/, $auth),2) if $auth; if ($mtime) { print " [ If-Modified-Since: ", HTTP::Date::time2str($mtime), " ]\n" if $verbose; $req->header('If-Modified-Since' => HTTP::Date::time2str($mtime)); } # this should happen automatically - what's wrong? $ua->cookie_jar->add_cookie_header($req) if $cookie_file; # $download_success = 1; $Boundary = ''; # RESUME: # The first-byte-pos value in a byte-range-spec gives the byte-offset # of the first byte in a range. The last-byte-pos value gives the # byte-offset of the last byte in the range; that is, the byte # positions specified are inclusive. Byte offsets start at zero. # Range: bytes=9500- &select_best_sample($filename); $file_size = 0 + (-s "$filename"); $num_callback = 0; print " [ RESUME: from byte $file_size ]\n" if $verbose; $content_length = &glynx_file::get_file_attrib($filename, 'Content-Length'); $content_length = $file_size + $DEFAULT_PART_SIZE unless $content_length; $req->push_header("Range" => "bytes=$file_size-$content_length") if $file_size; print " [ REQUEST = ", $req->as_string, " ] \n" if $verbose; print " [ REQUEST: PROXY = ", $ua->proxy, " ] \n" if $ua->proxy and $verbose; if ($url =~ /ftp:\/\//i) { $res = ftp::request($req, $timeout, \&download_callback); } else { if ($url =~ /(.*?)$post_separator(.*)/) { $content = $2; $req->uri($1); $req->content($content); $req->method("POST"); $req->push_header("Content-Length" => length($content)); $req->content_type('application/x-www-form-urlencoded'); print " [ POST: ", $req->as_string(), " ]\n" if $verbose; print " [ POST: url = $url ; $1 $post_separator $content ]\n" if $verbose; } # $res = http::request($req, $ua->proxy, \&download_callback, 65536, $TIMEOUT); $res = $ua->request($req, \&download_callback, 65536); # 65536); } # try to fix https if (($res->code == 501) and ($^O =~ /win32/i) and ($url =~ /https:\/\//i)) { my $proxy = $ua->proxy; # format ? my $agent = $ua->agent; my $opentype; eval " \$opentype = \$proxy ? INTERNET_OPEN_TYPE_PROXY : INTERNET_OPEN_TYPE_DIRECT; "; # [useragent, opentype, proxy, proxybypass, flags] $INET = new Win32::Internet(); # ($agent, $opentype, $proxy, '', INTERNET_FLAG_ASYNC); print " [ HTTPS: WIN32: $url -- $filename ]\n" if $verbose; $data = $INET->FetchURL($url); # print " [ HTTPS: WIN32: $data ]\n" if $verbose; if ($data) { $res->code(200); open(FILE, ">$filename") or die "Cannot write to $filename"; binmode(FILE); local($\) = ""; # ensure standard $OUTPUT_RECORD_SEPARATOR print FILE $data; close(FILE); goto DOWNLOAD_OK; # can't do redirects, etc. } } if ($res->header("X-Died")) { # circula os arquivos de tentativas - depois deve escolher a melhor &select_best_sample($filename); } # - look at the 3rd parameter on "206" # (when available -- otherwise it may be 500 Timeout), # Content-Length: 637055 --> if "206" this is "chunk" size # Content-Range: bytes 1449076-2086130/2086131 --> THIS is file size $content_range = $res->header("Content-Range"); if (($res->code == 206) and $content_range) { ($content_begin, $content_end, $content_size) = $content_range =~ /bytes\s+(\d+)-(\d+)\/(\d+)/; $file_size = -s "$filename"; $content_difference = $content_size - $file_size; if ($content_difference > 0) { # $download_success = 0; # not ready yet print " [ CONTENT: MISSING: $content_difference/$content_size BYTES ] \n" if $verbose; } else { print " [ CONTENT: OK: $file_size/$content_size BYTES ] \n" if $verbose; $res->code(200); } } DOWNLOAD_OK: # arriving here from FILE: (cache) or from HTTP: $ua->cookie_jar->save() if $cookie_file; print " [ COOKIES: ", $ua->cookie_jar->as_string(), " ]\n" if $cookie_file and $verbose; return $res; } # end: download 1; =head1 NAME Glynx - a download manager. =head1 DESCRIPTION Glynx makes a local image of a selected part of the internet. It currently supports resume/retry, referer, user-agent, frames, distributed download (see C<--slave>, C<--stop>, C<--restart>). It partially supports: redirect (using file-copy), java, javascript, multimedia, authentication (only basic), mirror, translating links to local computer (C<--makerel>), correcting file extensions, ftp, renaming too long filenames and too deep directories, cookies, proxy, forms, multipart responses. A http user interface is included. https: works only in Windows, with GET method. It should work in other OS and with other methods if openssl is installed. It can be used together with other download managers, making a distributed download process. =head1 SYNOPSIS =over =item Work with the http interface: glynx.pl - then type in the given address in your browser. - you will need other instances of the program (at least one) working as slaves. Just double-click the program again to open another instance, or use: glynx.pl --slave =item Command-line, do-everything at once: glynx.pl [options] =item Save work to finish later: glynx.pl [options] --dump="download-list-file" =item Finish saved download: glynx.pl [options] "download-list-file" =item Network mode (server/client/slave) =item - Clients: glynx.pl [options] --dump="download-list-file" or: glynx.pl --server [--port=8081] Starts a client-interface http server =item - Slaves (will wait until there is something to do): glynx.pl [options] --slave =back =head1 HINTS If you don't use command line options, the first program instance will be an http server. Remaining instances will be slaves ("executors"). How to create a default configuration: Start the program with all command-line configurations, plus --cfg-save or: 1 - start the program with --cfg-save 2 - edit glynx.ini file --subst, --exclude and --loop use regular expressions. http://www.site.com/old.htm --subst=s/old/new/ downloads: http://www.acme.com/new.htm - Note: the substitution string MUST be made of "valid URL" characters --exclude=/\.gif/ will not download ".gif" files - Note: Multiple --exclude are allowed: --exclude=/gif/ --exclude=/jpeg/ will not download ".gif" or ".jpeg" files It can also be written as: --exclude=/\.gif|\.jp.?g/i matching .gif, .GIF, .jpg, .jpeg, .JPG, .JPEG --exclude=/www\.site\.com/ will not download links containing the site name http://www.site.com/bin/index.htm --prefix=http://www.site.com/bin/ won't download outside from directory "/bin". Prefix must end with a slash "/". http://www.site.com/index%%%.htm --loop=%%%:0..3 will download: http://www.site.com/index0.htm http://www.site.com/index1.htm http://www.site.com/index2.htm http://www.site.com/index3.htm - Note: the substitution string MUST be made of "valid URL" characters - For multiple exclusion: use "|". - Don't read directory-index: ?D=D ?D=A ?S=D ?S=A ?M=D ?M=A ?N=D ?N=A => \?[DSMN]=[AD] To change default "exclude" pattern - put it in the configuration file Note: "File:" item in dump file is ignored You can filter the processing of a dump file using --prefix, --exclude, --subst If after finishing downloading you still have ".PART._BUSY_" files in the base directory, rename them to ".PART" (the program should do this by itself) Don't do this: --depth=1 --out-depth=3 because "out-depth" is an upper limit; it is tested after depth is generated. The right way is: --depth=4 --out-depth=3 This will do nothing: --dump=x graphic.gif because the dump file gets all binary files. Errors using https: [ ERROR 501 Protocol scheme 'https' is not supported => LATER ] or [ ERROR 501 Can't locate object method "new" via package "LWP::Protocol::https" => LATER ] This means you need to install at least "openssl" (http://www.openssl.org), Net::SSLeay and IO::Socket::SSL This DOS batch file will download a site with TRANSLATED pages: glynx.pl "http://world.altavista.com/urltrurl?lp=en_pt&url=http%%3A%%2F%%2F%1" --referer="http://world.altavista.com/urltrurl?lp=en_pt&url=http%%3A%%2F%%2F%1" --verbose --depth=1 --prefix=altavista --prefix=%1 --cookies=kuki.txt --timeout=120 substitute en_pt for your language pair. =head1 COPYRIGHT Copyright (c) 2000 Flavio Glock . All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. This program was based on examples in the Perl distribution. If you use it/like it, send a postcard to the author. =head1 COMMAND-LINE OPTIONS Very basic: --version Print version number (1.033) and quit --verbose More output --quiet No output --help This page --cfg-save Save configuration to file "glynx.ini" --base-dir=DIR Place to load/save files (default is ".") Development only: --make-cpan Preprocess files to make Glynx distribution Download options are: --sleep=SECS Sleep between gets, ie. go slowly (default is 1) --prefix=PREFIX Limit URLs to those which begin with PREFIX (default is URL) Multiple --prefix are allowed --depth=N Maximum depth to traverse (default is 0) --out-depth=N Maximum depth to traverse outside of PREFIX (default is 0) --referer=URI Set initial referer header (default is ".") --limit=N A limit on the number documents to get (default is 10000) --retry=N Maximum number of retrys (default is 5) --timeout=SECS Timeout value - increases on retrys (default is 30) --agent=AGENT User agent name (default is "Mozilla/3.0 (WinNT; I)") --mirror Checks all existing files for updates (default is --nomirror) --mediaext Creates a file link, guessing the media type extension (.jpg, .gif) (Windows perl makes a file copy) (default is --nomediaext) --makerel Make Relative links. Links in pages will work in the local computer. --auth=USER:PASS Set authentication credentials --cookies=FILE Set up a cookies file (default is no cookies) --name-len-max Limit filename size (default is 30) --dir-depth-max Limit directory depth (default is 8) Multi-process control: --slave Wait until a download-list file is created (be a slave) --server Be an http user-interface server, with slave processes --children How many slaves will this process spawn (default is "0") --port=N Http server TCP/IP port (default is "8081") --stop Stop slave --restart Stop and restart slave Other: --indexfile=FILE Index file in a directory (default is "_INDEX_.HTM") --part-suffix=.SUFFIX (default is "._PART_") (example: ".Getright" ".PART") --dump=FILE (default is "") make download-list file, to be used later --dump-max=N (default is 30) number of links per download-list file --dump-subdir Make a hiearchical dump (no backlinks). Goes faster. --invalid-char=C (default is "$") --exclude=/REGEXP/x (default is "") Don't download matching URLs Multiple --exclude are allowed --loop=REGEXP:INITIAL..FINAL (default is "") (eg: xx:a,b,c xx:'01'..'10') --subst=s/REGEXP/VALUE/x (default is "") ("\" must be written "\\") --404-retry will retry on error 404 Not Found (default). --no404-retry creates an empty file on error 404 Not Found. =head1 README Glynx - a download manager (robot) INSTALLATION WINDOWS: - Copy the script to your download directory, such as c:\glynx or c:\temp Do not use it on c:\ because it will create files there. - It can be used as command prompt script, or you can double click it to start it as a server or slave. You can open the program more times, if you have enough memory. If the programs are started from the same directory, they will work together, even if they are on different computers. - As a server, you can access it from your browser when you see the message: User interface server running at http://localhost:8081/ - The latest ActivePerl has all the modules needed. UNIX/LINUX: - Please note that the software will create MANY files in its work directory, so it is advisable to have a dedicated sub-directory for it. - chmod +x glynx.pl (if necessary) pod2html glynx.pl -outfile=glynx.htm (this is optional) - under RedHat 6.2 I had to upgrade or install these modules: HTML::Tagset MIME:Base64 URI HTML::Parser Digest::MD5 libnet libwww-perl - to use https you will need: openssl (www.openssl.org) Net::SSLeay IO::Socket::SSL How do I run Glynx? Before you run Glynx, you should be connected to the Internet. Glynx can then be run by typing 'perl glynx.pl [options] http://...' in a command prompt or terminal window. It can also be run with 'perl glynx.pl [options] --server' and then accessed by an internet browser. How to create a default configuration: Start the program with your command-line options, plus --cfg-save or: Start the program with --cfg-save, then edit glynx.ini file. Proxy, Firewalls 1.Add the setting HTTP_proxy, with your proxy name as the value (you must include "http://" ), followed by a colon and the proxy port, if applicable; e.g., "http://proxy:8080" 2.If you require a user name and/or password to access your proxy, add the settings HTTP_proxy_user and HTTP_proxy_pass, with your user name and password as the respective values. It is also possible to use an FTP proxy. See the Net::FTP documentation. COPYRIGHT Copyright (c) 2000 Flavio Glock . All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. This program was based on examples in the Perl distribution. ----------- =pod SCRIPT CATEGORIES Web Networking CGI =pod OSNAMES any =head1 TO-DO Glynx - a download manager. OTHER PROGRAMS LIKE THIS http://www.w3.org/Robot/ http://langfeldt.net/w3mir/ A robot site: http://www.botspot.com/ The robot exclusion standard is described in http://info.webcrawler.com/mak/projects/robots/norobots.html. SHORT TO-DO - More command-line compatibility with lwp-rget - Complete the user interface GOALS generalize: option to use (external) java and other script languages to extract links configurable file names and suffixes, filesystem limits configurable dump file format plugins more protocols; download streams language support adhere to perl standards: pod documentation, distribution parallelize things and multiple computer support cpu and memory optimizations accept hardware/internet failures: restartable reduce internet traffic: minimize requests, cache everything from perlhack.pod: Keep it fast, simple, and useful. Keep features/concepts as orthogonal as possible (what's orthogonal?). No arbitrary limits (platforms, data sizes, cultures). Keep it open and exciting to use/patch/advocate Perl everywhere. Either assimilate new technologies, or build bridges to them. PROBLEMS (not bugs) - It takes some time to start the program; not practical for small single file downloads. - It should have a graphical front-end; there exists a web front-end. - Hard to install if you don't have Perl or have outdated Perl modules. It works fine with Perl 5.6 modules. - slave mode uses "dump files", and doesn't delete them. - children processes are silent - don't have consoles - doesn't recognize if a site does not accept simultaneous connections - possibly requires Perl 5.6 -- 5.5 doesn't like "our" is this a bug? - web front-end doesn't show status TESTS - test: counting MAX_DOCS with retry - test: redirect 3xx - test: makerel - test: makerel with javascript/java - test: env_ftp - test: unknown protocol is a fatal error (on page links) - test: folded directories - test: escaped save/compare for all URL names BUGS - redirection between sites makes something wrong with "referer" in "translate.bat" - translation file title shows local directory name, instead of url - ftp: 400 (Bad Request) FTP return code 350 Content-Type: text/plain Restarting at 0. - 030: loop doesn't expand (user-interface) - 030: User interface doesn't know base-dir - breaks in multiprocess mode (children ne 0) - if 2 processes get the same dump-file it is deleted. - restart should check if program file-size is stable before restarting. it may be being updated through a slow ftp session - testing busy_files for expiration uses file age when script started (-M) should use age NOW. - busy-dump-files expire in 12h -- should be configurable - modify_file_attrib doesn't make a backup before writing, or locking -- may lose contents. - sometimes an empty packet is received, truncating the communication. - content_base MAY be wrong if query contains / in line and we receive a redirect: $path =~ s|^(.*)\/||g; # remove directory -
    sends aaa='' in the query - log user-interface ip number doesn't work - modify ftp.pm to return "file/link" information -- save "dir" as _index_.htm - download leaks to "/" instead of base-dir - restart/stop must rename .grx._BUSY_ => .grx - slave should spawn if depth > 0 AND filetype = html; - test if dump-file exists - don't overwrite .err - control whether a slave can dump dump-files They could dump after processing all depth>0, AND only if there were any. - problems downloading java referer: http://www.bera.org/ class: http://www.bera.org/java/JambaAnimator.trolley_anim.class - don't show "RENAME: CAN'T FIND" and "URL:" , unless VERBOSE OPTIMIZATIONS - cache the name-list; cache dir-names - use an optional database connection - Persistent connections; - take a look at LWP::ParallelUserAgent - take a look at LWPng for simultaneous file transfers - take a look at LWP::Sitemapper - POE - use eval around things do speed up program loading - speed up search using stacks indexed per directory or per site DOCUMENTATION - make a --install (extract included files, pod). - make-cpan should use binmode - document the short command-line options - FTP proxy USER INTERFACE - Dr. Watson when user asks for a stop -- will let it disabled - Dr. Watson not very happy with "restart" too - "stop" in httpd doesn't work - Linux core-dump when accessing http-server - user-interface has NO security - user authentication, if user-ip not equal server-ip - should support some authentication. Maybe use --auth= - user-interface does almost no test for valid field contents - what happens when choose base-dir in user interface? - probably should not be possible - how to save-config in user interface? - read lynx help, try to make compatible commands - status page - log file: missing time/date and user ip number - how to do user-answered forms? (POST) - maybe an http-client with push - http-client could use http-server as a "proxy" - rename "old" .grx._BUSY_ files to .grx (timeout = 1 day?) option: touch busy file to show activity - scripting option (execute sequentially instead of parallel). POST with interactive mode or from-file - perl/tk front-end; finish web front end - save "to-do" file each 10 minutes, so it can restart. - timed downloads - start/stop hours - option portuguese/english/other - accept --url=http://... - accept --batch=...grx - arrays for $LOOP,$SUBST; accept multiple URL - makerel: make relative links to OTHER sites should be an option - makerel: should work on applets. - put / / on exclude, etc if they don't have - graphical-interface: option iso9660 - option compress-extension: .tar.gz -> .TGZ (for iso9660) - extension .ab---z -> .ABZ - _names_.htm should point to ../_names_.htm ("Up to higher level directory") and to subdir/_names_.htm; header = "Directory listing of ... " - directories should be of type "DIR" - better formatted name-list, as in ftp-dir - make a logo - include all options, help, in graphical interface - graphical interface easier to configure - stop-task in cgi (--restart + delete grx file) PROTOCOL - "Los Alamos Web Server (Unix)" - looks like it is sending empty packets, which the LWP interprets as end of transmission (closes connection). - HTTP/1.0 403 Forbidden 403 BOGUS query range malformed Bad Request - probably caused by range 9999- (missing end-byte of range) - use HTTP::Status for message codes - pass "VERBOSE" to ftp.pm - Fatal: https:// in URI.pm - 301/302: Moved -- should save an intermediate file to keep links working locally - use robot-rules - create variable max-link-len (now is 500 bytes) - improve forms support (read rfc...) - do not press 2 "submits" at the same time; do not press TYPE=RESET - explore "options" - ignore/accept comments: - nested comments??? but accept javascript - should we read vbasic too? - check: 19.4.5 HTTP Header Fields in Multipart Body-Parts Content-Encoding Persistent connections: Connection-header Accept: */*, *.* - pnm protocol: - realvideo, .rpm files, rtsp: -- RFC 2326 - streams - gnutella - 401 Authentication Required, generalize abort-on-error list - install and test "https"; do a how-to. - 401 - auth required -- supply name:pass - implement "If-Range:" - better error handling on protocol error, for page links; wrong link "c:\xxx" is a fatal error - make auth-digest - AUTH should always send nnn:ppp@url for auth-basic (always...) - ftp_proxy - --proxy option, overriding env_proxy The LWP::Simple interface will call env_proxy() for you automatically. Applications that use the $ua->env_proxy() method will normally not use the $ua->proxy() and $ua->no_proxy() methods. PERL - make it a Perl module (crawler, robot?), generic, re-usable. - maybe a "LWP::Restartable" - funny Win-NT error "can't find" something: "The system cannot find the file specified" - active perl installation error - javascript interpreter option OTHER - file locking - send some html requests to slave processes - will require some additional controls. - does Location / Redirect count as a depth level ? - simultaneous download from mirror sites - use ftpsearch, others, to find mirror sites. - name-list for other sites is creating too many empty directories. empty-directories should be created only when necessary, and file names should be stored somewhere else until the directories are created. - should be change file-name after download (in case of mime-type mistakes) - "Are we reprocessing the cache?" should trigger a filter to remove all /_INDEX_.HTM - should make backup when mirroring (option) - finish "my_link" - perl "link" is copying instead of linking, even on linux - use the name-lookup table to make up for links/redirects - lwp-rget "depth" is "0" when we use "1" - Doesn't recreate unix links on "ftp". Should do that instead of duplicating files (same on http redirects). - http server to make distributed downloads across the internet - use eval to avoid fatal errors; test for valid protocols - don't ignore "File:" on dump-file - execute/override download-list-file "File:" option: use --subst=/k:\\temp/c:\\download/ - change the retry loop to a "while" Generalization, user-interface: - --log-headers should be an option - option to understand robot-rules - make .glynx the default suffix for everything - try to support through download-list-file - internal small javascript interpreter - config comment-string in download-list-file - config comment/uncomment for directives - arquivo default para dump sem parametros - "dump-[host]-1"? make backup on overwrite dump - plugins: for each chunk, page, link, new site, level change, dump file change, max files, on errors, retry level change. Opcao: usar callbacks, ou fazer um modulo especializavel. - dump suffix option - use environment - aceitar configuracao --nofollow="shtml" e --follow="xxx" - controle de hora, bytes por segundo - packing for distribution, include rfcs, etc? - installation hints, package version problems (abs_url) - make an object for link-lists - escolher e especializar um existente. - documentar melhor o uso de "\" em exclude e subst - Renomear a extensao de acordo com o mime-type (ou copiar para o outro nome). --on-redirect=rename --on-redirect=copy --on-redirect=link --on-mime=... - tamanho maximo do arquivo recebido - usar: $ua->max_size([$bytes]) - nao funciona com callback - disk full or unaccessible / alternate dir - montar links usando java ? - a biblioteca LWP faz sozinha Redirection 3xx ? - criar arquivo PART com tamanho zero quando da erro 408 - timeout COMMAND LINE OPTIONS - new options in user interface: - form field/value (translate to SUBST) - "--proxy=http:"1.1.1.1",ftp:"1.1.1.1" "--proxy="1.1.1.1" acessar proxy: $ua->proxy(...) Set/retrieve proxy URL for a scheme: $ua->proxy(['http', 'ftp'], 'http://proxy.sn.no:8001/'); $ua->proxy('gopher', 'http://proxy.sn.no:8001/'); - accept empty "--dump" or "--nodump" --backup / --nobackup when mirroring, overwriting dump, or reprocessing links. --max-mb=100 limita o tamanho total do download --nospace permite links com espacos no nome (ver lwp-rget) --include=".exe" --nofollow=".shtml" --follow=".htm" opcoes de inclusao de arquivos (procurar links dentro) --exclude-hreftext= --exclude-buttontext exclude a link by the "href" or "submit" text --full ou --depth=full opcao site inteiro --chunk=128000 --dump-all grava todos os links, incluindo os ja existentes e paginas processadas --post-separator Not implemented, but won't generate fatal errors: --hier Download into hierarchy (not all files into cwd) --iis Workaround IIS 2.0 bug by sending "Accept: */*" MIME header; translates backslashes (\) to forward slashes (/) --keepext=type Keep file extension for MIME types (comma-separated list) --nospace Translate spaces URLs (not #fragments) to underscores (_) --tolower Translate all URLs to lowercase (useful with IIS servers) ------------------ =head1 CHANGES Glynx - a download manager. Version history: 1.033 - eval{} around URI::URL methods 1.032.007 - fix mkdir rights 1.032.006 - fix POST contents (content_length was zero) - fix Cookie save (ignore_discard was false) 1.032.005 - restart in Windows/ActivePerl is better, but doesn't kill parent - doesn't check prefix if doesn't have at least one 1.032.004 - library routines had to be renamed in URI:URL and others 1.032.003 - glynx.pm now use HTTP::Cookies - cookies are explicit set, loaded, and saved 1.032.002 - seek file before write - doesn't work! - write to temp file before commiting - slower but works! - starts with children=0 - it is safer because of broken winnt perl command line 1.032.001 - dump-subdir "accelerates" download 1.032 - fixed restart in win32 - fixed save/restore vars (reset timeout) - file translations does not store repeated short names - split into glynx_file.pm 1.031 - split into glynx.pm - correct "port 21" reprocessing, using quotemeta - does not request partial file if initial byte is zero - preprocess options before inserting dump-file urls - corrected reprocessing of "url/.." 1.030 - default children is zero - better --make-cpan - will go to slave mode if server cannot start - sleep between spawns - compiled with strict - $DEFAULT_PART_SIZE is 16k - busy-dump-files expire in 12h - makes error-dump-file on last retry - stores Content-Length to use in Range requests. - makes unique dump filenames - timeout has a maximum - supports secure (https) GET on windows, even without openssl. 1.029 - accepts simple multipart responses (8bit-byte encoded, single file) - http://host/path/ prefix was http://host in user-interface - corrected user-interface options, help, more/less options - removed file eg/glynx_httpd.pl - corrected slave dumper; dump-file options 1.028 - better interface - corrected interface dump-file - will fork slaves, unless --children=0 - if there is no url in command line, will be server - if there is a url on command line, will not spawn slaves - doesn't try to link files anymore (win98 fatal) - http server has a tick time - better query splitter - make-cpan uses binmode - read/save auth - tested: cookies - codebase .= '/' - user interface calculates prefix 1.027 --server, --port = http server - takes action on 301 and 302 - Moved ... - "Location" base is url, instead of referer - calls equery on make_filename - some hacks to glue parts together and make one big file for CPAN (--make-cpan) - includes ftp.pm in the same file. - POST sends Content-Length 1.026: - simple GET and PUT forms - reject link "c:\" - bigger max-link-len (500 bytes) - slave intervals on SLEEP if it is active; otherwise on TIMEOUT - ftp-dir sends content-location - finds ftp.pm in program's directory - better make-dir - escapes single-quotes reading config-file - corrected: didn't start if had --prefix - resume ftp transfers. Needs the custom ftp.pm module. - base-dir is always absolute - makerel: default is don't make backup - verify each subdirectory for transformations - sites with ports translate back correctly to site:port - make_shorter_name receives untransformed url also - separated pod file - saves Content-Type - can save any file attribute (delimiter is space) - reprocessing the cache is slower, due to relative links reconstruction - might create directories for linked sites, if it is necessary to create reference files 1.025: - correction (again...) in slave mode variables save/restore - included simple web front-end in eg directory 1.024: new options: --name-len-max= Limit filename size --dir-depth-max= Limit directory depth --cookies=FILE --auth= --makerel Make relative links - makerel will make relative links to other sites; will process last depth; save modified page and make a backup of the original page. - better error handling on command line url "protocol error" - use env_proxy - my_link started - $RETRY_TIMEOUT_MULTIPLIER set to 2 1.023: - better redirect, but perl "link" is copying instead of linking - --mirror option (304) - --mediaext option - sets file dates to last-modified 1.022: - multiple --prefix and --exclude seems to be working - uses Accept:text/html to ask for an html listing of the directory when in "ftp" mode. - corrected errors creating directory and copying file on linux 1.021: - uses URI::Heuristic on command-line URL - shows error response headers (if verbose) - look at the 3rd parameter on 206 (when available -- otherwise it gives 500), Content-Length: 637055 --> if "206" this is "chunk" size Content-Range: bytes 1449076-2086130/2086131 --> THIS is file size - prefix of: http://rd.yahoo.com/footer/?http://travel.yahoo.com/ should be: http://rd.yahoo.com/footer/ - included: "wav" - sleep had 1 extra second - sleep makes tests even when sleep==0 1.020: oct-02-2000 - optimization: accepts 200, when expecting 206 - don't keep retrying when there is nothing to do - 404 Not Found error sometimes means "can't connect" - uses "--404-retry" - file read = binmode 1.019: - restart if program was modified (-M $0) - include "mov" - stop, restart 1.018: - better copy, rename and unlink - corrected binary dump when slave - comparacao de tamanho de arquivos corrigida - span e' um comando de css, que funciona como "a" (a href == span href); span class is not java 1.017: - sleep prints dots if verbose. - daemon mode (--slave) - url and input file are optional 1.016: sept-27-2000 - new name "glynx.pl" - verbose/quiet - exponential timeout on retry - storage control is a bit more efficient - you can filter the processing of a dump file using prefix, exclude, subst - more things in english, lots of new "to-do"; "goals" section - rename config file to glynx.ini 1.015: - first published version, under name "get.pl" - rotina unica de push/shift sem repeticao - traduzido parcialmente para ingles, revisao das mensagens 1.014: - verifica inside antes de incluir o link - corrige numeracao dos arquivos dump - header "Location", "Content-Base" - revisado "Content-Location" 1.013: - para otimizar: retirar repeticoes dentro da pagina - incluido "png" - cria/testa arquivo "not-found" - processa Content-Location - TESTAR - achar um site que use - incluido tipo "swf", "dcr" (shockwave) e "css" (style sheet) - corrige http://host/../file gravado em ./host/../file => ./file - retira caracteres estranhos vindos do javascript: ' ; - os retrys pendentes sao gravados somente no final. - (1) le opcoes, (2) le configuracao, (3) le opcoes de novo 1.012: - segmenta o arquivo dump durante o processamento, permitindo iniciar o download em paralelo a partir de outro processo/computador antes que a tarefa esteja totalmente terminada - utiliza indice para gravar o dump; nao destroi a lista que esta na memoria. - salva a configuracao completa junto com o dump; - salva/le get.ini 1.011: corrige autenticacao (prefix) corrige dump le dump salva/le $OUT_DEPTH, depth (individual), prefix no arquivo dump 1.010: resume se o site nao tem resume, tenta de novo e escolhe o melhor resultado (ideia do Silvio) 1.009: 404 not found nao enviado para o dump processa arquivo se o tipo mime for text/html (nao funciona para o cache) muda o referer dos links dependendo da base da resposta (redirect) considera arquivos de tamanho zero como "nao no cache" gera nome _INDEX_.HTM quando o final da URL tem "/". 1.008: trabalha internamente com URL absolutas corrige vazamento quando out-nivel=0 1.007: segmenta o arquivo dump acelera a procura em @processed corrige o nome do diretorio no arquivo dump ----------------- =head1 ACCESSORY SCRIPTS These are some scripts that might help making Glynx more user-friendly. Glynx - a download manager. ACCESSORIES glynx_slave.pl A shortcut for starting glynx in slave mode, to execute jobs. Configuration: $base_dir -- where glynx look for jobs. glynx.cgi User interface, installable in a web-server. glynx_menu.pl Subroutines used by glynx.cgi, very basic. Configuration: $base_dir -- where glynx look for jobs; $log_dir -- where we write the log file. COPYRIGHT Copyright (c) 2000 Flavio Glock . All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. This program was based on examples in the Perl distribution. ----------- =head2 glynx.cgi - a CGI wrapper to run the user-interface #!/usr/bin/perl # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # This program was based on examples in the Perl distribution, # mainly from Gisle Aas. # # If you use it/like it, send a postcard to the author. # Find the latest version in CPAN or http://www.pucrs.br/flavio use CGI qw/:standard/; require "./glynx_menu.pl"; &glynx_configure; $query = new CGI; %in = $query->Vars; &glynx_menu(%in); 1; =head2 glynx_httpd.pl - a stand-alone CGI server for running the user-interface. - removed: use --server instead =head2 glynx_slave.pl - a shortcut for running Glynx in daemon mode. #!/usr/bin/perl # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # This program was based on examples in the Perl distribution, # mainly from Gisle Aas. # # If you use it/like it, send a postcard to the author. # Find the latest version in CPAN or http://www.pucrs.br/flavio $base_dir = "d:/download_glynx/"; exec "../glynx.pl --slave --base-dir=\"$base_dir\""; 1; =head2 glynx_menu.pl - a small html-based user-interface. #!/usr/bin/perl # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # This program was based on examples in the Perl distribution, # mainly from Gisle Aas. # # If you use it/like it, send a postcard to the author. # Find the latest version in CPAN or http://www.pucrs.br/flavio # THESE ARE SUBROUTINES - THIS FILE IS NOT INTENDED TO BE EXECUTED sub glynx_configure { $base_dir = "d:/download_glynx/"; $log_dir = "d:/download_glynx/"; } sub glynx_menu { my %in = @_; print < Glynx - Download Manager

    Glynx - Download Manager

    EOT $_ = $in{url}; tr/\\/\//; ($in{url}, $resto) = /^(.*?)(\/?)$/; $_ = $in{url}; if ((! /http:/) and (! /ftp:/)) { $_ = "http://" . $_; s/\/\/\//\/\//; } $in{url} = $_; print < URL:
    Depth:
    Prefix:
    Label:
    Other options:
    Obs: 
          URL = http://site/directory/file
    
          Depth 0 = 1 file
          Depth 1 = 1 file + links & images
          Depth 2 = links their images
    
          Prefix = site/directory, limits unnecessary downloads (optional)
    
          Label = job name
    
          Options = (not ready yet)
        
    ENDOFTEXT if (($in{url} ne "") and ($in{url} ne "http\:\/\/")) { print "
    "; $in{depth} = $in{depth} + 0; $in{depth} = 5 if ($in{depth} > 5); $in{label} =~ s/[\s\+]//; $in{label} = "default" unless $in{label}; $in{base} = $in{url} unless $in{base}; open (FILE, ">>${log_dir}glynx.log"); print FILE <$cmd"); print FILE <method, $r->url, $r->header, $r->content, $r->headers_as_string); select $self; ($query) = $url =~ /\?(.*)/; $query = join("\&", $query, $header); # print "(method, url, header, content, headers, query) = # ($method, $url, $header, $content, $headers, $query) "; foreach (split("\&", $query)) { $in{$1} = unescape($2) if /(.*)=(.*)/; } &main::glynx_menu($query, %in); select STDOUT; } package main; our $VERSION = "1.033"; our $progname = $0; our ($prog_dir) = $progname =~ /(.*)[\/\\]/; $prog_dir = "./" unless $prog_dir; # Copyright (c) 2000 Flavio Glock . All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # This is a subroutine -- not to be executed directly use strict; my $MAX_OPTION = 3; # $addr will store client-IP -- we don't have that yet my $addr; # @var isn't used my (@cmd, @opt, @val, @var); sub preprocess_menu_options { my(@a, $i, $tmp); @a = split("\n", &list_options); $i = 0; @cmd = (); foreach(@a) { ($cmd[$i], $opt[$i], $val[$i]) = /^\s*--([\w\-]*)=([\w\-\:\.\/]*)\s*(.*)$/; ($cmd[$i], $val[$i]) = /^\s*--([\w\-]*)\s*(.*)$/ unless $opt[$i]; unless ($cmd[$i]) { $i--; ($tmp) = /^\s\s*(.*)$/; $val[$i] .= "
    $tmp" if $tmp; } else { $var[$i] = ''; $var[$i] = '$' . $cmd[$i] if $cmd[$i]; $var[$i] =~ tr/a-z\-/A-Z\_/; } $i++; } } sub glynx_footer { print <
    Glynx $main::VERSION - Copyright (c) 2000 Flavio Glock. All rights reserved. This program is free software EOT print end_html; } sub glynx_menu_restart { my %in = @_; &main::make_restart; print "

    Restarting Glynx

    "; print "Please wait about one minute before pressing BACK

    "; } sub glynx_menu_help { my %in = @_; my($i); print ""; foreach(0 .. $#cmd) { $i = $_; if ($val[$i]) { # $val[$i] =~ s/\\/\\\\/g; # escape for printing print ""; } else { print "
    $cmd[$i]$val[$i]
    "; print ""; } } print "
    "; } sub glynx_print_menu { my (%in, $query) = @_; my ($option); print <
    Depth:
    Prefix:
    Job name:
    EOT # Other options:
    foreach (1 .. $MAX_OPTION) { # $_ = 1; print " Option: "; $option = $_; # "option$option"; print "\n"; print "
    \n"; } print ""; } sub glynx_menu { my ($query, %in) = @_; print < Glynx - Download Manager

    Glynx - Download Manager

    EOT # print " [ query $query ] "; $_ = $in{url}; tr/\\/\//; # ($in{url}, $resto) = /^(.*?)(\/?)$/; # $_ = $in{url}; if ((! /http.?:/) and (! /ftp:/)) { $_ = "http://" . $_; s/\/\/\//\/\//; } $in{url} = $_; if ($in{do} eq 'Help') { &glynx_menu_help(%in); print ""; goto footer; } elsif ($in{do} eq 'Restart') { &glynx_menu_restart(%in); print "";; print ""; goto footer; } elsif ($in{do} eq 'More') { $MAX_OPTION += 2; goto MENU; } elsif ($in{do} eq 'Less') { $MAX_OPTION -= 2 if $MAX_OPTION > 2; goto MENU; } else { # eq 'Back' MENU: &glynx_print_menu(%in); print "
    "; print " / "; print " options
    "; print " Glynx
    "; } if (($in{do} eq "Download") and ($in{url} ne "") and ($in{url} !~ /^http\:\/*$/)) { print "

    Processing

    "; $in{depth} = $in{depth} + 0; $in{depth} = 5 if ($in{depth} > 5); $in{dump} =~ s/[\s\+]//; $in{dump} = "default" unless $in{dump}; $in{prefix} = &make_prefix_from_url('', $in{url}) unless $in{prefix}; print "URL: $in{url}
    "; print "Depth: $in{depth}
    "; print "Prefix: $in{prefix}
    "; print "Job name: $in{dump}
    "; print "Base-dir: $main::BASE_DIR
    "; open (FILE, ">>${main::BASE_DIR}glynx.log"); print FILE "ip: $addr\n"; print FILE "URL: $in{url}\n"; print FILE "Depth: $in{depth}\n"; print FILE "Prefix: $in{prefix}\n"; print FILE "Dump: $in{dump}\n"; foreach (1 .. $MAX_OPTION) { print FILE "", $in{"option$_"}, ": ", $in{"val$_"}, "\n" if $in{"option$_"} and $in{"val$_"}; } print FILE "\n"; close (FILE); my $cmd = "$main::BASE_DIR$in{dump}.grx"; open (FILE, ">$cmd"); foreach (1 .. $MAX_OPTION) { print FILE "//", $in{"option$_"}, ": ", $in{"val$_"}, "\n" if $in{"option$_"} and $in{"val$_"}; } print FILE "//DEPTH: $in{depth}\n"; print FILE "//DUMP: $in{dump}\n"; print FILE "//PREFIX: $in{prefix}\n"; print FILE "//\n"; print FILE "URL: $in{url}\n"; print FILE "//Referer: .\n"; print FILE "//Depth: $in{depth}\n"; close (FILE); } footer: glynx_footer; } 1; our @Config_Vars = qw/DIR_DEPTH_MAX NAME_LEN_MAX COOKIES AUTH DEPTH TIMEOUT AGENT REFERER INDEXFILE SLEEP OUT_DEPTH BASE_DIR PART_SUFFIX MAX_DOCS INVALID_CHAR LOOP SUBST DUMP DUMP_MAX RETRY_MAX MAKEREL DUMP_SUBDIR/; our @Config_Arrays = qw/PREFIX EXCLUDE/; sub valid_option { my ($opt) = @_; # print " valid_option "; foreach (@Config_Vars, @Config_Arrays) { # print " opt: $opt eq $_
    "; return 1 if ($opt eq $_); } return 0; } # Defaults my $AUTH = ''; my $MAKEREL = 0; my $DUMP_SUBDIR = 0; my $MIRROR = 0; my $MEDIAEXT = 0; my $DEPTH = 0; my $TIMEOUT = 30; my $AGENT = "Mozilla/3.0 (WinNT; I)"; my $REFERER = "."; my $SLEEP = 1; my $OUT_DEPTH = 0; # opcao para maximo de niveis ao sair do site (0 = nao sai) our $BASE_DIR = "."; my $PART_SUFFIX = "._PART_"; my $MAX_DOCS = 10000; my $INVALID_CHAR = '$'; my $COOKIES = ''; my $NAME_LEN_MAX = 30; my $DIR_DEPTH_MAX = 8; my $SERVER_PORT = 8081; my $CHILDREN = 0; my $INDEXFILE = "_INDEX_.HTM"; my @PREFIX = (); my @EXCLUDE = (); # "/\\/tn_|\\?[DSMN]=[AD]|banner|\\.gif/i"; my $LOOP = ""; # "~~~~:1..50"; my $SUBST = ""; # "s/show\\.asp\\?//"; my $DUMP = ""; my $DUMP_MAX = 30; my $RETRY_MAX = 5; my $RETRY_404 = 1; my $SLAVE = 0; my $STOP = 0; my $RESTART = 0; # to-be configurable my $RETRY_TIMEOUT_MULTIPLIER = 2; my $MAKE_BACKUP = 0; my $MAX_TIMEOUT = 3600; my $DEFAULT_PART_SIZE = 4096 * 4; # Defaults de uso interno, nao configuravel my $MAX_TESTE_REPETICAO = 30; # testa os ultimos links antes de incluir na lista my $LIST_SIZE = 3; # tamanho da estrutura de @links = ($url, $referer, $nivel) my $CFG_SAVE = 0; my $DUMP_SUFFIX = ".grx"; my $NOT_FOUND_SUFFIX = "._NOT_"; my $BUSY_SUFFIX = "._BUSY_"; my $DONE_SUFFIX = "._DONE_"; my $GLYNX_SUFFIX = ".glynx"; my $BACKUP_SUFFIX = ".bak"; my $ERROR_SUFFIX = "._ERR_"; my $GET_SEPARATOR = "\?"; # needs escaping my $POST_SEPARATOR = "_X_POST_X_"; my $CFG_FILE = "glynx.ini"; # - at startup, read file-time of $SLAVE_RESTART_FILE. # - do a restart whenever $SLAVE_RESTART_FILE file-time changes. # - exit whenever $SLAVE_STOP_FILE exists. my $SLAVE_STOP_FILE = "_STOP_$GLYNX_SUFFIX"; my $SLAVE_RESTART_FILE = "_RESTART_$GLYNX_SUFFIX"; # deixar fora desta lista: htm html js cgi txt cfm shtml my @DEFAULT_EXCLUDE = qw/wav mov png swf css dcr doc rtf bak ra rm sfw pcx log ps bmp dvi pdf jar java class rar iso bin midi mid mod mpeg mpg mp3 avi jpg jpeg gif gz msi asf zip cdf exe tar/; my $default_exclude = "/\\." . join("\$|\\.", @DEFAULT_EXCLUDE) . "\$/i"; # print "default_exclude: $default_exclude\n"; # data structures my @dump; my @links; my @retry; my @processed; # general control my $New_Program_Date; my $New_Restart; my $Last_Program_Date; my $Last_Restart; my $Dump_index; my $dump_filenum; my $dump_filename; my $dump_nivel_zero; my $dump_mode_error = 0; my $num_docs; my $retry; our @loop; my $http_daemon; # my $ua; my $nlinks; my $just_slave; my $Slave_file; my $prefix; my $new_random_filename; my $dir_index; my $dir_expr; my $dir_busy; my $dir; my $VERBOSE; my $QUIET; my %Main_Config; my $Boundary; my $num_callback; my $filename; # my $real_name; my $KILL_FINISH = 0; # 1; my $KILL_RESTART = 0; # 2; my $KILL_RESTART_PROGRAM_MODIFIED = 0; # 3; my $KILL_STOP = 0; # 4; my $IS_CHILD; my $children; my $SERVER; my @myARGV = @ARGV; $progname =~ s|.*/||; # only basename left $progname =~ s/\.\w*$//; # strip extension if any if ($#ARGV < 0) { print "\nERROR: Missing command line\n\n"; &usage; } # $VERBOSE=1; print " [ READ-CFG: $CFG_FILE ]\n" if $VERBOSE; &read_dump ($CFG_FILE); print " [ READ-CFG: get-options ]\n" if $VERBOSE; &get_options; print " [ AUTH = $AUTH ]\n" if $VERBOSE; print " [ READ-CFG: preprocess-options ]\n" if $VERBOSE; &preprocess_options; &save_Config (\%Main_Config); &show_Config (\%Main_Config) if $VERBOSE; sub get_options { print " [ GET_OPTIONS ]\n" if $VERBOSE; GetOptions( 'depth=i' => \$DEPTH, 'timeout=i' => \$TIMEOUT, 'agent:s' => \$AGENT, 'referer:s' => \$REFERER, 'indexfile=s' => \$INDEXFILE, 'sleep=i' => \$SLEEP, 'out-depth=i' => \$OUT_DEPTH, 'base-dir=s' => \$BASE_DIR, 'part-suffix=s' => \$PART_SUFFIX, 'limit=i' => \$MAX_DOCS, 'invalid-char=s' => \$INVALID_CHAR, 'prefix=s' => \@PREFIX, 'exclude:s' => \@EXCLUDE, 'loop:s' => \$LOOP, 'subst:s' => \$SUBST, 'dump=s' => \$DUMP, 'auth=s' => \$AUTH, 'dump-max=i' => \$DUMP_MAX, 'retry=i' => \$RETRY_MAX, '404-retry!' => \$RETRY_404, # --no404-retry 'slave!' => \$SLAVE, 'verbose!' => \$VERBOSE, 'quiet!' => \$QUIET, 'restart!' => \$RESTART, 'stop!' => \$STOP, 'mirror!' => \$MIRROR, 'mediaext!' => \$MEDIAEXT, 'makerel!' => \$MAKEREL, 'dump-subdir!' => \$DUMP_SUBDIR, 'cookies=s' => \$COOKIES, 'name-len-max=i' => \$NAME_LEN_MAX, 'dir-depth-max=i' => \$DIR_DEPTH_MAX, 'port=i' => \$SERVER_PORT, 'make-cpan!' => \&make_CPAN, 'server!' => \$SERVER, 'children=i' => \$CHILDREN, # subroutines 'version' => \&print_version, 'help' => \&usage, 'cfg-save!' => \$CFG_SAVE, # not implemented, but exist in lwp-rget: 'hier' => \¬_implemented('hier'), 'iis' => \¬_implemented('iis'), 'tolower' => \¬_implemented('tolower'), 'nospace' => \¬_implemented('nospace'), 'keepext=s' => \¬_implemented('keepext'), ) || usage(); } sub preprocess_options { $BASE_DIR = "." if ! $BASE_DIR; $BASE_DIR =~ s/\\/\//g; #print " [ BASE_DIR: $BASE_DIR ]\n" if $VERBOSE; unless (-d $BASE_DIR) { print " [ BASE-DIR: creating $BASE_DIR ]\n" if $VERBOSE; &glynx_file::make_dir($BASE_DIR); } my $dir = abs_path("$BASE_DIR"); # print " [ BASE_DIR: $BASE_DIR + $dir ]\n" if $VERBOSE; $BASE_DIR = $dir; $BASE_DIR .= "/" if ! ($BASE_DIR =~ /\/$/); print " [ BASE_DIR: $BASE_DIR ]\n" if $VERBOSE; @loop = split(":",$LOOP); print " [ LOOP: $LOOP ]\n" if $VERBOSE; $TIMEOUT = $MAX_TIMEOUT if $TIMEOUT > $MAX_TIMEOUT; print " [ TIMEOUT: $TIMEOUT ]\n" if $VERBOSE; } sub save_Config { my ($hashref) = @_; print " [ SAVE-CONFIG ]\n" if $VERBOSE; no strict "refs"; foreach(@Config_Vars) { eval "\$\$hashref{$_} = \$" . $_; } foreach(@Config_Arrays) { eval "\$\$hashref{$_} = [ \@" . $_ . " ]"; } use strict "refs"; } sub retrieve_Config { my ($hashref) = @_; print " [ RETRIEVE-CONFIG ]\n" if $VERBOSE; no strict "refs"; foreach(@Config_Vars) { eval "\$" . $_ . " = \$\$hashref{$_}"; } foreach(@Config_Arrays) { eval "\@" . $_ . " = \@{\$\$hashref{$_}}"; } use strict "refs"; } sub show_Config { my ($hashref) = @_; print " [ SHOW-CONFIG ]\n" if $VERBOSE; foreach(@Config_Vars) { print " [ $_: ", $$hashref{$_} , " ]\n" if $VERBOSE; } foreach(@Config_Arrays) { print " [ $_: ", join(',', @{$$hashref{$_}} ) , " ]\n" if $VERBOSE; } } if ($CFG_SAVE) { &cfg_save_default; exit 0; } my ($url, $referer, $nivel); $url = shift; # optional url or input file $SERVER = 1 unless $url; $IS_CHILD = 0; if ($CHILDREN and ($SERVER or $SLAVE)) { # PARENTAL CONTROL my $pid; $children = $CHILDREN; SPAWN: if ($^O =~ /win32/i) { # win32 uses "start" my $cmd = "start \"Glynx-$children\" /MIN /LOW \"$^X\" \"$0\" --slave --children=0"; print "$cmd\n" if $VERBOSE; print " [ FORKING: $children ]\n" if $VERBOSE; `$cmd`; $children--; sleep(20); # go easy on CPU goto SPAWN unless $children <= 0; } else { undef $pid; if (!defined($pid = fork)) { print " [ cannot fork: $! ]\n" unless $QUIET; # exit 0; } elsif ($pid) { print " [ FORKING: $children ]\n" if $VERBOSE; $children--; sleep(20); # go easy on CPU goto SPAWN unless $children <= 0; # print "begat $pid"; # print "I'm the parent"; } else { # print "I'm the child"; # be a very quiet slave: $SLAVE = 1; $SERVER = 0; $QUIET = 1; $VERBOSE = 0; $IS_CHILD = 1; } } } # end: spawn children print " [ $progname.pl Version $VERSION ]\n" if $VERBOSE; print " [ URL = $url ]\n" if $VERBOSE; unless ($url =~ /$DUMP_SUFFIX$/i) { $url = uf_urlstr($url); } our $main_url = $url; print " [ URL = $url ]\n" if $VERBOSE; print " [ LOOP = " , join(" ", @loop), " ]\n" if $VERBOSE; # print " [ STOP ]\n" if $STOP; $Last_Restart = -M "$BASE_DIR/$SLAVE_RESTART_FILE"; # print " [ LAST-RESTART: $Last_Restart ]\n" if $VERBOSE; $Last_Program_Date = -M $0; # print " [ LAST-PROGRAM-DATE: $0 = $Last_Program_Date ]\n" if $VERBOSE; if ($SERVER and ! $SLAVE) { &http_server; print " [ Can't start http daemon at port $SERVER_PORT ]\n" unless $QUIET; print " [ Starting slave mode ]\n" unless $QUIET; $SLAVE = 1; $SERVER = 0; } usage() if @ARGV; &make_restart if $RESTART; &make_stop if $STOP; &my_main; sub my_main { my $u1; # estrutura de @links = ($url, $referer, $nivel, ...) @links = (); # coleta links para serem visitados ($url, $referer, $nivel, ...) $dump_nivel_zero = 1; # if $DUMP, save last level. Reset if $SLAVE. $Slave_file = ""; @dump = (); # gera o arquivo dump (mesma estrutura de @links) SLAVE_LOOP: @retry = (); # arquivos incompletos, para tentar novamente @processed = (); # links ja visitados (lista simples) $num_docs = 0; $dump_filenum = 0; $retry = $RETRY_MAX; # $prefix = $PREFIX[0]; $Dump_index = 0; if ( ($url =~ /$DUMP_SUFFIX$/i) and !($url =~ /:/) ) { # DUMP: # verifica se o nome corresponde a um arquivo dump if (-e "$url") { $dump_filename = "$url"; } elsif (-e "$BASE_DIR$url") { $dump_filename = "$BASE_DIR$url"; } elsif (-e "$url$DUMP_SUFFIX") { $dump_filename = "$url$DUMP_SUFFIX"; } elsif (-e "$BASE_DIR$url$DUMP_SUFFIX") { $dump_filename = "$BASE_DIR$url$DUMP_SUFFIX"; } else { die " [ CAN'T FIND INPUT FILE: $url ]" } read_dump($dump_filename); # read_dump($DUMP) if $DUMP; # evita perder informacao ??? } elsif ($url) { # URL: # pega o nome do site $REFERER = $url unless $REFERER; print " [ URL: abs: $url ]\n" if $VERBOSE; eval { $u1 = URI::URL->new_abs($url, $REFERER) }; #$myhost = $u1->host; #print "Host: $myhost\n"; unless ($#PREFIX >= 0) { print " [ PREFIX: abs: $PREFIX[0] ]\n" if $VERBOSE; $prefix = &make_prefix_from_url($PREFIX[0], $u1); @PREFIX = ($prefix); print " [ PREFIX: @PREFIX ]\n" unless $QUIET; } &insert_url ($url, $REFERER, $DEPTH); } else { # print " [ NO URL ]\n" unless $QUIET; } download_links_retry: while (@links) { if ($num_docs >= $MAX_DOCS) { print " [ FIM: num_docs > $MAX_DOCS ]\n" if $VERBOSE; last; } #print "LINKS $#links -- $url --"; ($url, $referer, $nivel) = shift_list(\@links); $nlinks = ($#links + 1) / $LIST_SIZE; last if $nlinks > $MAX_DOCS; # print " ($url, $referer, $nivel [$nlinks] \n"; download($url, $referer, $nivel); print " [ STATUS: READ:", $#processed + 1, "/", +(($#links + 1) / $LIST_SIZE) + $#processed + 1, " LATER:", +($Dump_index) / $LIST_SIZE, "/", + ($#dump + 1) / $LIST_SIZE, " DEPTH:", $DEPTH - $nivel, "/", $DEPTH, " ]\n" unless $QUIET; # time to make a partial dump? if ( $DUMP and $DUMP_MAX and (($#dump - $Dump_index) > ($DUMP_MAX * $LIST_SIZE) ) ) { &dump; } } # RETRY? if (($#retry >= 0) and ($retry > 1)) { print " [ RETRY: LEVEL:", $RETRY_MAX - $retry + 2, "/$RETRY_MAX URL:", +($#retry + 1) / $LIST_SIZE, " ]\n" unless $QUIET; $retry--; @links = @retry; @processed = (); @retry = (); # @dump = (); # $Dump_index = 0; $RETRY_TIMEOUT_MULTIPLIER = 1 if $RETRY_TIMEOUT_MULTIPLIER < 1; $RETRY_TIMEOUT_MULTIPLIER = 10 if $RETRY_TIMEOUT_MULTIPLIER > 10; $TIMEOUT *= $RETRY_TIMEOUT_MULTIPLIER; $TIMEOUT = $MAX_TIMEOUT if $TIMEOUT > $MAX_TIMEOUT; glynx::timeout($TIMEOUT); print " [ RETRY: TIMEOUT:", int($TIMEOUT), " ]\n" if $VERBOSE; goto download_links_retry; } else { if ($#retry < 0) { print " [ DONE: DON'T NEED TO RETRY ]\n" if $VERBOSE; } elsif ($retry > 1) { print " [ FAILED: URL:$retry ]\n" if $VERBOSE; } else { print " [ FAILED: WILL NOT RETRY ]\n" if $VERBOSE; $dump_mode_error = 1; } } # append pending retrys to dump print " [ DUMP: Move ", +($#retry + 1) / $LIST_SIZE, " from Retry to Dump ]\n" if $VERBOSE; @dump = (@dump, @retry); print " [ DUMP: [$DUMP] ", +($#dump + 1) / $LIST_SIZE, " ]\n" if $VERBOSE; while (($Dump_index <= $#dump) and $DUMP) { &dump; } $dump_mode_error = 0; # check for Slave mode $just_slave = 1; SLAVE_IDLE: while ($SLAVE) { if ($Slave_file ne "") { # done &glynx_file::my_rename ("$Slave_file$BUSY_SUFFIX", "$Slave_file$DONE_SUFFIX") if -e "$Slave_file$BUSY_SUFFIX"; } else { #print " [ SLAVE: unknown slave file $Slave_file ]\n"; } # timer # get_options; # read_dump ($CFG_FILE); &retrieve_Config (\%Main_Config); &show_Config (\%Main_Config) if $VERBOSE; if (! $SLAVE) { print " [ SLAVE: CANCELLED ]\n" unless $QUIET; last SLAVE_IDLE; } if ($just_slave) { print " [ SLAVE: IDLE FOR $SLEEP SEC ]\n" unless $QUIET; &my_sleep ($SLEEP); $just_slave = 0; } else { print " [ SLAVE: IDLE FOR $TIMEOUT SEC ]\n" unless $QUIET; &my_sleep ($TIMEOUT); } # what's in dir? $dir_expr = "$BASE_DIR"; opendir DIR, $dir_expr or die " [ SLAVE: CAN'T OPEN $dir_expr ]\n"; my @dir = readdir(DIR); print " [ SLAVE: DIR: $BASE_DIR -- ", join(',',@dir), " ]\n" if $VERBOSE; my @busy_files = grep { (/$BUSY_SUFFIX$/) and (-f "$BASE_DIR$_") } @dir; @dir = grep { (/$DUMP_SUFFIX$/i) and (-f "$BASE_DIR$_") } @dir; closedir DIR; print " [ SLAVE: $dir_expr: $DUMP_SUFFIX -- ", join(',',@dir), " ]\n" if $VERBOSE; $dir_index = 0; # test one of busy_files for expiration -- make it available for slaves my $random_file = int(rand(1 + $#busy_files)); my $random_filename = $BASE_DIR.$busy_files[$random_file]; # print " [ TEST BUSY: $BUSY_SUFFIX -- $random_filename $random_file ", join(",",@busy_files)," ]\n" if $VERBOSE; if ((-M $random_filename) > 1/2) { $new_random_filename = $random_filename; $new_random_filename =~ s/$BUSY_SUFFIX$//; &glynx_file::my_rename($random_filename, $new_random_filename); &glynx_file::my_touch($new_random_filename); } SLAVE_TEST_DIR: while ($#dir >= $dir_index) { # rename file $dir = "$BASE_DIR$dir[$dir_index]"; $dir_busy = "$dir$BUSY_SUFFIX"; if (-e $dir_busy) { print " [ SLAVE: $dir busy ]\n" if $VERBOSE; if (-e $dir) { # both exist -- delete one &glynx_file::my_unlink ($dir_busy); } if (-e $dir_busy) { $dir_index++; next SLAVE_TEST_DIR; } } &glynx_file::my_rename ($dir, $dir_busy); # check again unless (-e ($dir_busy)) { print " [ SLAVE: can't rename $dir ]\n" unless $QUIET; next SLAVE_TEST_DIR; } unless (-s ($dir_busy)) { print " [ SLAVE: $dir empty ]\n" unless $QUIET; next SLAVE_TEST_DIR; } # read dump file $dump_nivel_zero = 0; @dump = (); # gera o arquivo dump (mesma estrutura de @links) read_dump($dir_busy); $Slave_file = $dir; print " [ SLAVE: processing $Slave_file ]\n" unless $QUIET; last SLAVE_IDLE } # dir ok } # slave if ($SLAVE) { print " [ SLAVE: continue processing $Slave_file ]\n" if $VERBOSE; $url = ""; $dump_nivel_zero = 0; # download level zero, even if $DUMP goto SLAVE_LOOP; } print " [ END ]\n" unless $QUIET; } # my_main sub make_prefix_from_url { my ($seed_prefix, $u1) = @_; my ($eval_err); eval { $prefix = URI::URL->new_abs($seed_prefix, $u1) }; print " [ PREFIX: Generated-from: $seed_prefix, $u1 ]\n" if $VERBOSE; print " [ PREFIX: Generated: $prefix ]\n" if $VERBOSE; # clear fragment, query... # test for invalid protocol eval{$prefix->userinfo('')}; if ($eval_err = $@) { print " [ PREFIX: Error: $eval_err ]\n"; print " [ PREFIX: Error: Possible cause: invalid protocol ]\n" if $VERBOSE; return $u1; } my ($prefix, $separator, $query) = split_query_from_url($prefix); print " [ PREFIX: new: $prefix ]\n" if $VERBOSE; eval { $prefix = URI::URL->new($prefix) }; $prefix->fragment(''); # removes file name unless ($prefix =~ /\/$/) { ($prefix) = $prefix =~ /^(.*\/)/; # print " [ PREFIX: new: $prefix ]\n" if $VERBOSE; } # removes authentication if ($prefix =~ /\@/) { ($prefix) = $prefix =~ /.*\@(.*)/; print " [ PREFIX: new: $prefix ]\n" if $VERBOSE; } return $prefix; } sub my_sleep { my ($time) = @_; print " [ SLEEP $SLEEP " unless $QUIET; foreach ( 1 .. $time ) { &check_stop; sleep 1; print "." unless $QUIET; } &check_stop; print " done ]\n" unless $QUIET; } sub make_stop { # - do a restart whenever $SLAVE_RESTART_FILE file-time changes. print " [ MAKE-STOP ]\n" if $VERBOSE; &glynx_file::my_unlink("$BASE_DIR/$SLAVE_STOP_FILE"); &glynx_file::my_unlink("$BASE_DIR/$SLAVE_RESTART_FILE"); &glynx_file::my_create_empty("$BASE_DIR/$SLAVE_STOP_FILE"); } sub make_restart { # - exit whenever $SLAVE_STOP_FILE exists. print " [ MAKE-RESTART ]\n" if $VERBOSE; &glynx_file::my_unlink("$BASE_DIR/$SLAVE_STOP_FILE"); &glynx_file::my_unlink("$BASE_DIR/$SLAVE_RESTART_FILE"); &glynx_file::my_create_empty("$BASE_DIR/$SLAVE_RESTART_FILE"); } sub my_stop { $http_daemon = undef; exit 0 if ($IS_CHILD); sleep (30); # wait a bit for children exit 0; } sub my_exec { $http_daemon = undef; exit 0 if ($IS_CHILD); # children first sleep (5 + int(rand(1 + 15))); # wait a bit for children # do not respawn $_ = $_[0]; s/--children=\d*/ /; $_ .= " --children=0 "; if ($^O =~ /win32/i) { # win32 uses "start" my $cmd = "start \"Glynx-restarted\" /MIN /LOW \"$^X\" $_"; print "$cmd\n" if $VERBOSE; `$cmd`; } else { exec $_; } die "done"; } sub check_stop { my ($do_str); # --stop Stop slave # --restart Stop and restart slave # - at startup, read file-time of $SLAVE_RESTART_FILE. # - do a restart whenever $SLAVE_RESTART_FILE file-time changes. # - exit whenever $SLAVE_STOP_FILE exists. # print " [ SLAVE: $SLAVE -- $BASE_DIR/$SLAVE_STOP_FILE ]\n" if $VERBOSE; # not a command-line business return if ! ($SLAVE or $SERVER); # print " [ SLAVE: CHECK STOP ]\n" if $VERBOSE; if (-e "$BASE_DIR/$SLAVE_STOP_FILE") { print " [ SLAVE: STOP ]\n" if $VERBOSE; # exit $KILL_STOP; my_stop; } if (-e "$BASE_DIR/$SLAVE_RESTART_FILE") { $New_Restart = -M "$BASE_DIR/$SLAVE_RESTART_FILE"; # print " [ LAST-RESTART: $Last_Restart -- $New_Restart ]\n" if $VERBOSE; if ($Last_Restart != $New_Restart) { print " [ SLAVE: RESTART ]\n" if $VERBOSE; # exit $KILL_RESTART; $do_str = "$0 " . join(' ', @myARGV); print " [ STARTING $do_str ]\n" if $VERBOSE; print " [ RESTARTING ]\n" unless $QUIET; &my_exec ($do_str); die "done"; } } if (-e $0) { # program modified? $New_Program_Date = -M $0; # print " [ LAST-PROGRAM-DATE: $Last_Program_Date -- $New_Program_Date ]\n" if $VERBOSE; if ($Last_Program_Date != $New_Program_Date) { print " [ SLAVE: RESTART ]\n" if $VERBOSE; # exit $KILL_RESTART_PROGRAM_MODIFIED; $do_str = "$0 " . join(' ', @myARGV); print " [ STARTING $do_str ]\n" if $VERBOSE; print " [ RESTARTING ]\n" unless $QUIET; &my_exec ($do_str); die "done"; } } } # Download List File Format: # [//] space [comment] # [//]tag: space value # [//]any_var_name: space value # Tags: # URL: xxx - URL # Referer: - referrer URL # Depth: - link levels to download from the URL # Reserved, unimplemented tags: # File: xxx -- Absolute path\filename for file (DOS style slashes) # Desc: xxx -- Description # User: xxx -- Username # Pass: xxx -- Password (encrypted) # Alt: xxx -- Alternate URL (multiple) # # names are Case-Sensitive. # "//" is for compatibility with other download managers and may be ommitted. # "//" is read as [!\w\s]* # values may have single-quotes as delimiters. # values may contain single-quotes and spaces. # single-quotes don't need to be escaped. sub read_dump { my ($dump_filename) = @_; my ($var_name, $processed_options); $processed_options = 0; # my (@tmp_prefix); # @tmp_prefix = @PREFIX; # ??? @PREFIX = (); # will use file's prefixes # print " [ DUMP: opening $dump_filename ]\n" if $VERBOSE; if (! -e $dump_filename) { $dump_filename = "$BASE_DIR$dump_filename"; if (! -e $dump_filename) { return } } print " [ DUMP: opening $dump_filename ]\n" if $VERBOSE; open(FILE, $dump_filename) or die " [ DUMP: Can't open $dump_filename ]"; #//OUT_DEPTH: 0 #//PREFIX: http://us.a1.yimg.com/us.yimg.com/ --> ALLOW MULTIPLE #URL: http://us.a1.yimg.com/us.yimg.com/i/ww/m5v2.gif #File: D:\download_getright\us.a1.yimg.com\us.yimg.com\i\ww\m5v2.gif #//Referer: http://www.yahoo.com/ #//Depth: 2 # $dump_nivel_zero = 0; # desabilita, pois todos os arquivos sao nivel zero. # $OUT_DEPTH = 1 if ($OUT_DEPTH < 1) and (! $PREFIX); # nao sei quem e o host... # reset parameters $url = ""; #File: -- not used ??? my $Referer = $REFERER; my $Depth = $DEPTH; foreach() { chomp; my ($cmd, $opt) = split(" ", $_, 2); if ($cmd =~ /URL:/i) { # $prefix = $PREFIX[0]; unless ($processed_options) { &preprocess_options; # must do this once before processing url $processed_options = 1; } &insert_url ($url, $Referer, $Depth) if $url; # reset parameters $url = $opt; #File: -- not used ??? # ? $referer = $Referer; # ? $depth = $Depth; $Referer = $REFERER; $Depth = $DEPTH; } elsif ($cmd =~ /(\w*):/) { $var_name = $1; $opt = $1 if $opt =~ /^'(.*)'\s*$/; # remove delimiters $opt =~ s/'/\\'/; # escape other delimiters $opt = "'" . $opt . "'"; # put delimiters back if (grep { /^$var_name$/ } @Config_Arrays) { eval "push @" . $var_name . ", $opt"; print " [ CFG: \$$var_name = ", eval "\@" . $var_name . "[-1]", " ]\n" if $VERBOSE; } else { # if ($var_name ne "DUMP") { eval "\$$var_name = $opt"; print " [ CFG: \$$var_name = $opt ]\n" if $VERBOSE; } } } close(FILE); # last one ... &insert_url ($url, $Referer, $Depth) if $url; &preprocess_options; } sub dump { return unless ($DUMP); my ($filename, $name, $dir, $dump_filename, $dump_links); my ($url, $referer, $nivel); $dump_links = 0; # cria um diretorio absoluto $dir = abs_path("$BASE_DIR"); #print "$dir\n"; do { $dump_filenum++; $dump_filename = "$dir/$DUMP"; $dump_filename .= $DUMP_SUFFIX if ! ($dump_filename =~ /$DUMP_SUFFIX$/); $dump_filename =~ s/(.*)\.(.*)/$1-$dump_filenum\.$2/ if $DUMP_MAX; } until ((! -e "$dump_filename") and (! -e "$dump_filename$BUSY_SUFFIX") and (! -e "$dump_filename$DONE_SUFFIX")); $dump_filename .= $ERROR_SUFFIX if $dump_mode_error; print " [ DUMP: $dump_filename ]\n" unless $QUIET; if ($#dump < 0) { print " [ DUMP: EMPTY ]\n" unless $QUIET; &glynx_file::my_unlink ($dump_filename); return; } cfg_save($dump_filename); open (FILE, ">>$dump_filename"); while ($Dump_index <= $#dump) { $url = $dump[$Dump_index++]; $referer = $dump[$Dump_index++]; $nivel = $dump[$Dump_index++]; print " [ WRITE: $url ]\n" if $VERBOSE; $name = &make_filename($url); $filename = "$dir/$name"; if (-e $filename) { if (-d $filename) { print " [ ja existe diretorio: $filename ]\n" if $VERBOSE; $filename .= '/' . $INDEXFILE; print " [ trying: $filename ]\n" if $VERBOSE; next if (-s $filename); } elsif (-s $filename) { print " [ ja existe: $filename ]\n" if $VERBOSE; next; } } $filename =~ s/\//\\/g; print FILE <= $DUMP_MAX); } close (FILE); print " [ DUMP: finish ]\n" if $VERBOSE; } # end: dump sub cfg_save_default { cfg_save($CFG_FILE); } sub cfg_save { my ($filename) = @_; # my ($tmp_prefix); my ($var_name); my $file = $filename; if (-e $filename) { } elsif (-e "$BASE_DIR$filename") { $file = "$BASE_DIR$filename"; } open(FILE, ">$file") or open(FILE, ">$filename") or open(FILE, ">$BASE_DIR$filename") or die " [ Can't write config to $file ]\n"; # Write out actual prefix in use, instead of the (maybe null) config prefix. # Otherwise it may happen that the links will be rejected as "out" when read. # @tmp_prefix = @PREFIX; # $PREFIX[0] = $prefix; print FILE <new($u1); $host = $u1->host; $port = $u1->port; $path = $u1->path; }; # print " [ NAME: (host) $host (path) $path $separator (query) $query ]\n" if $VERBOSE; return &make_filename_from_parts($host, $port, $path, $separator, $query); } sub check_translation_url { my ($short_filename, $short_parent) = @_; my ($trans_filename, @a, $tr_str, $new_name); my (@parent_parts, $local_parent, $my_parent, $local_url); # split parent and test path translations... ??? ($local_parent) = $short_parent =~ /$BASE_DIR(.*)/; @parent_parts = split('/', $local_parent); push @parent_parts, $short_filename; # print " [ check_translation_url: ", join(" ", @parent_parts), " ]\n" if $VERBOSE; @a = split(quotemeta($INVALID_CHAR), $parent_parts[0]); # print " [ check_translation_url: $INVALID_CHAR = ", join(" $INVALID_CHAR ", @a), " ]\n" if $VERBOSE; if ($#a == 1) { $parent_parts[0] = join(':', @a); } UP: foreach (1 .. $#parent_parts) { if (($_ == 1) and ($parent_parts[$_] eq '..')) { print " [ SPLICE-FROM: ", join(" ", @parent_parts), " ]\n" if $VERBOSE; splice (@parent_parts, $_, 1); print " [ SPLICE-TO: ", join(" ", @parent_parts), " ]\n" if $VERBOSE; redo UP; } elsif ($parent_parts[$_] eq '..') { print " [ SPLICE-FROM: ", join(" ", @parent_parts), " ]\n" if $VERBOSE; splice (@parent_parts, $_ - 1, 2); print " [ SPLICE-TO: ", join(" ", @parent_parts), " ]\n" if $VERBOSE; redo UP; } } # check that each parent exists, or that it has an alias. $local_parent = $BASE_DIR; $local_url = ""; foreach (0 .. $#parent_parts) { my $interim_name = &glynx_file::check_translation_url_node($parent_parts[$_], $local_parent); unless ($interim_name) { $my_parent = $local_parent . $parent_parts[$_]; if (-e $my_parent) { # print " [ short: exists: $my_parent ]\n"; $interim_name = $parent_parts[$_]; } else { # print " [ short: not found: $my_parent -- using it anyway ]\n"; # To-do -- ??? (maybe it is ok) $interim_name = $parent_parts[$_]; } } # print " [ short: found $interim_name ]\n"; # $my_parent = $local_parent . $interim_name . '/'; $local_parent .= $parent_parts[$_] . '/'; $local_url .= $interim_name . '/'; } # now put the filename on it # ... $local_url =~ s/${INDEXFILE}\/?$//; $local_parent =~ s/${INDEXFILE}\/?$//; unless (-d "$BASE_DIR$local_parent") { $local_url =~ s/\/$//; $local_parent =~ s/\/$//; # print " [ short: NOT-DIR: $local_parent => $local_url ]\n"; } # print " [ short: END $local_parent => $local_url ]\n" if $VERBOSE; return $local_url; } sub make_shorter_name { my ($filename, $parent, $urlname) = @_; my ($new_name, $trans_filename, @a, $name, $extension, $maxname, $base_name); my ($random_1, $random_2, $rnd); my ($digits); # do we have a name in $NAME_TRANSLATION_FILE ? if ($new_name = &glynx_file::check_translation_file($urlname, $parent)) { $_[0] = $new_name; return; } ($name, $extension) = split('\.',$filename,2); if (length($extension) > 10) { # invalid extension? -- arbitrary limit # print " [ SHORTER-NAME: invalid extension: $extension ]\n" if $VERBOSE; ($name, $extension) = ($filename,''); } $extension =~ tr/\//${INVALID_CHAR}/; # in case this is a joined subdirectory name $maxname = $NAME_LEN_MAX - length($extension) - 1; $maxname = 8 if $maxname < 8; # -- arbitrary limit, again if (length($name) <= $maxname) { # can't do any better? $new_name = $name; $new_name =~ tr/\//${INVALID_CHAR}/; # in case this is a joined subdirectory name $new_name .= '.' . $extension if $extension; } else { print " [ SHORTER-NAME: $name + $extension ]\n" if $VERBOSE; # 4 digits should be enough $digits = 4; # 1000 .. 9999 $random_1 = '1' . ('0' x ($digits - 1)); # 1 => 1000 $random_2 = $random_1 . '0'; # 2 => 10000 # print " formula: int(rand($random_2 - $random_1)) + $random_1 \n"; $maxname = $maxname - $digits + 1; $base_name = substr($name, 0, $maxname); $base_name =~ tr/\//${INVALID_CHAR}/; # in case this is a joined subdirectory name # note: this way of verifying unique MAY be a problem in a multi-process environment do { $rnd = int(rand($random_2 - $random_1)) + $random_1; $new_name = $base_name . $rnd; $new_name .= '.' . $extension if $extension; # check for duplicate names # print " [ SHORTER-NAME: VERIFYING UNIQUE $new_name ]\n" if $VERBOSE; } while grep { /=$new_name>/ } @a; } # log the name-change &glynx_file::log_translation_file($urlname, $new_name, $parent) if $urlname ne $new_name; $_[0] = $new_name; } sub make_filename_from_parts { my ($host, $port, $path, $separator, $query) = @_; my ($name, $urlname, $depth1, @file_names, $parent); # keep original names somewhere my ($url_name, @url_names); my ($url_host, $url_port, $url_path, $url_query) = ($host, $port, $path, $query); $name = $host; $name .= $INVALID_CHAR . $port if ($port != 80) and ($name); $url_name = $url_host; $url_name .= ":" . $url_port if ($port != 80) and ($url_name); $path =~ tr/\\/\//; # \ => / $path =~ s/\/$/\/${INDEXFILE}/g; # final slash => "/$INDEXFILE" $path =~ s/\/\//\//g; # // => / $path =~ s/\/[^\/]*?\/\.\.\//\//g; # /aaa/xxx/../ => /aaa/ $query =~ s/[\\\/\:\*\?\"\<\>\|]/${INVALID_CHAR}/g; # invalid chars $name .= $path; $name =~ s/[\:\*\?\"\<\>\|]/${INVALID_CHAR}/g; $url_path =~ s/\/[^\/]*?\/\.\.\//\//g; # /aaa/xxx/../ => /aaa/ $url_name .= $url_path; $name .= $INVALID_CHAR if $separator; $url_name .= $separator if $separator; $name .= $query if $query; $name =~ s/\.$/\$/; # final dot => invalid char $url_name .= $url_query if $query; # Win-NT charset: # allowed: = & _ - space # not allowed: \ / : * ? " < > | # Win-NT names with dots: # allowed: .* ..* ...* # *.* *..* *...* # not allowed: . .. *. # print " [ NAME: $name => (host) $host (path) $path (query) $query ]\n" if $VERBOSE; # print " [ NAME: $url_name => $name ]\n" if $VERBOSE; @file_names = split("\/", $name); @url_names = split("\/", $url_name, $#file_names + 1); # print " [ NAME: name_depth: $#file_names file_name: $file_names[-1] ]\n" if $VERBOSE; # up to 2 times dir depth reduction, by joining pairs of dir-names. if ($#file_names > $DIR_DEPTH_MAX) { $depth1 = $#file_names - 1; foreach (3 .. $depth1) { # print " process: $_ -- $#file_names -- $DIR_DEPTH_MAX \n"; if (($#file_names > $DIR_DEPTH_MAX) and ($_ <= $#file_names)) { splice(@file_names, -$_, 2, $file_names[-$_] . "/" . $file_names[1-$_]); splice(@url_names, -$_, 2, $url_names[-$_] . "/" . $url_names[1-$_]); } } } # again... if ($#file_names > $DIR_DEPTH_MAX) { $depth1 = $#file_names - 1; foreach (3 .. $depth1) { # print " process: $_ -- $#file_names -- $DIR_DEPTH_MAX \n"; if (($#file_names > $DIR_DEPTH_MAX) and ($_ <= $#file_names)) { splice(@file_names, -$_, 2, $file_names[-$_] . "/" . $file_names[1-$_]); splice(@url_names, -$_, 2, $url_names[-$_] . "/" . $url_names[1-$_]); } } } # check file/dir name length $parent = $BASE_DIR; foreach (0 .. $#file_names) { if ((length($file_names[$_]) > $NAME_LEN_MAX) or ($file_names[$_] =~ /\//)) { # print " [ NAME: CHANGE: $url_names[$_] to $file_names[$_] at $parent ]\n"; &make_shorter_name($file_names[$_], $parent, $url_names[$_]); # print " [ NAME: NOW IS: $file_names[$_] ]\n"; } $parent .= "/" unless $parent =~ /\/$/; $parent .= $file_names[$_]; } $name = join("\/", @file_names); $urlname = join("\/", @file_names); print " [ NAME: name_depth: $#file_names file_name: $file_names[-1] name: $name ]\n" if $VERBOSE; return $name; } sub download { my ($url, $referer, $nivel) = @_; my ($u1, $msg, $new_file_location, $content_location, $rel_filename, $count, $Full_Text, $lm, $new_filename, $file_depth, $base_filename, $url_filename, $original, $uri_filename, $prev, $cache_filename, $method, $new_file_uri, $form_method, $form_request, @form_query, $downloaded, $parent, $form_action, $item, $value, @tags, @scripts, @a, $mime_text_html, @form_names, @forms, $url_path, $Content_Type, $suffix, @suffix, $content_base, $path, $path1, $path2, $new_url, $separator, $url_no_query, $location, $location, $content_range, $content_size, $content_difference, $file_size, $data, $INET, $query, $res, $base, $content_end, $content_begin, $filesize, $mtime, $url_base, $content_length, $url_base, $meio1, $meio2, $name, $content, ); my $main_depth = split('/', $PREFIX[0]); $mime_text_html = 0; # should process links? $downloaded = 0; # new file or cache? $Content_Type = ''; $u1 = $url; # cuida para ficar neste host # $OUT_DEPTH == 0 - nao faz download externo # $OUT_DEPTH == 1 - faz download mas nao segue (nivel zero) if ( ($#PREFIX >= 0) and not grep { $url =~ /$_/ } @PREFIX) { print " [ OUT ", join(",",@PREFIX), " DEPTH:$nivel OUT-DEPTH:$OUT_DEPTH ]\n" if $VERBOSE; return if $OUT_DEPTH < 1; $nivel = $OUT_DEPTH - 1 if $nivel >= $OUT_DEPTH; return if $nivel < 0; print " [ OUT: DEPTH => $nivel ]\n" if $VERBOSE; } # controle do que ja foi visitado $meio1 = $#processed / 3; $meio2 = $meio1 + $meio1; foreach (0 .. $meio1, $meio2 .. $#processed, +($meio1 + 1) .. +($meio2 - 1)) { if ($processed[$_] eq $url) { print " [ DID ]\n" if $VERBOSE; return; } } push @processed, $url; if ($DUMP_SUBDIR and $nivel > 0) { # my $this_prefix = &make_prefix_from_url($main_url, $url . '_'); my $this_prefix = &make_prefix_from_url($url . '_', ''); my $this_depth = split('/', $this_prefix); print " [ THIS-PREFIX: $this_prefix -- $PREFIX[0] : $this_depth -- $main_depth ]\n" if $VERBOSE; if ($this_depth > $main_depth) { # dump subdir print " [ DUMP-SIZE: $Dump_index $#dump ]\n" if $VERBOSE; &dump if $#dump > $Dump_index; my $temp_prefix = $PREFIX[0]; $PREFIX[0] = $this_prefix; print " [ DUMP-THIS: $this_prefix ]\n" if $VERBOSE; push_list (\@dump, $url, $referer, $nivel); &dump if $#dump > $Dump_index; $PREFIX[0] = $temp_prefix; return; } } $name = &make_filename($url); # print " [ REAL_NAME: $real_name -- URL: $url ]\n" if $VERBOSE; $filename = "$BASE_DIR$name"; # $filename is global if (-e "$filename$NOT_FOUND_SUFFIX") { print " [ NOT-FOUND: ja existe $filename$NOT_FOUND_SUFFIX ]\n" if $VERBOSE; return; } $mtime = 0; if (-e $filename) { if (-d $filename) { print " [ DIR EXISTS: $filename ]\n" if $VERBOSE; $filename .= '/' . $INDEXFILE; print " [ CREATE FILE: $filename ]\n" if $VERBOSE; unless ($MIRROR) { if (-s $filename) { # URL should have ending "/" ($path, $separator, $query) = split_query_from_url($url); # ($path, $query) = split('\?', $url, 2); $url = $path . '/' . $query if ! ($path =~ /\/$/); goto DOWNLOAD_OK; } } } elsif (-s $filename) { print " [ FILE EXISTS: $filename ]\n" if $VERBOSE; unless ($MIRROR) { goto DOWNLOAD_OK; } } $mtime = (stat($filename))[9]; } &glynx_file::make_dir("$BASE_DIR$name"); # print "Download: $url\n"; if ($DUMP and ($nivel < 1) and ($dump_nivel_zero)) { print " [$nivel => DUMP]\n" if $VERBOSE; push_list (\@dump, $url, $referer, $nivel); return; } &my_sleep($SLEEP); # GET: $res = glynx::download( url => $url, filename => $filename . $PART_SUFFIX, # real_name => $real_name, agent => $AGENT, timeout => $TIMEOUT, cookie_file => $COOKIES ? "$BASE_DIR$COOKIES" : '', mtime => $mtime, verbose => $VERBOSE, quiet => $QUIET, auth => $AUTH, referer => $referer, post_separator => $POST_SEPARATOR, INDEXFILE => $INDEXFILE, ); # DOWNLOAD FINISHED OR ABORTED my ($status_line) = $res->status_line; my ($return_code) = $status_line =~ /(\d\d\d)/; unless ($return_code == 200){ print " [ RESPONSE: $return_code ERROR <<\n", $res->as_string, " >> RESPONSE ]\n" if $VERBOSE; $msg = $status_line; if (($msg =~ /301/) or ($msg =~ /302/)) { # Moved: should do what "Location:" says my $location = $res->header("Location"); print " [ OK: $msg : $location ]\n" if $VERBOSE; $content_base = $res->header("Content-Base"); if ($location) { if ($content_base) { eval { $u1 = URI::URL->new_abs($location, $content_base) }; } else { eval { $u1 = URI::URL->new_abs($location, $url) }; } &insert_url ($u1, $url, $nivel - 1); } } elsif ($msg =~ /304/) { print " [ OK: 304 NOT MODIFIED ]\n" unless $QUIET; } elsif (($msg =~ /404/) and ($url =~ /(.*)${INDEXFILE}$/)) { # looks like we are re-processing the cache... # try to find out original URL print " [ OOPS: Are we re-processing the cache? Trying $1 ]\n" unless $QUIET; push_list (\@retry, $1, $referer, $nivel); } elsif (($msg =~ /404/) and (! $RETRY_404)) { print " [ ERROR $msg => CANCEL ]\n" unless $QUIET; if (-e "$filename$PART_SUFFIX") { # cria arquivo not-found &glynx_file::my_rename ("$filename$PART_SUFFIX", "$filename$NOT_FOUND_SUFFIX"); } elsif (-e "$filename") { &glynx_file::my_rename ("$filename", "$filename$NOT_FOUND_SUFFIX"); } elsif (-e "$filename$NOT_FOUND_SUFFIX") { } else { &glynx_file::my_create_empty("$filename$NOT_FOUND_SUFFIX"); } } else { print " [ ERROR $msg => LATER ]\n" unless $QUIET; push_list (\@retry, $url, $referer, $nivel); # print " $retry -- push ", join(",", @retry) , " ($url, $referer, $nivel) \n"; } return; } # end: error on download # DOWNLOAD FINISHED AND CORRECT print " [ OK: ", $status_line, " ]\n" if $VERBOSE; &glynx_file::my_rename ("$filename$PART_SUFFIX", "$filename"); &glynx_file::my_unlink ("$filename$PART_SUFFIX-1"); $downloaded = 1; # new file, not cache $num_docs++; print " [ RESPONSE <<\n", $res->as_string, " >> RESPONSE ]\n" if $VERBOSE; #HTTP/1.1 200 OK #Connection: close #Date: Sat, 23 Sep 2000 08:52:22 GMT #Server: Apache/1.3.6 (Unix) #Content-Type: text/html #Content-Type: image/jpeg #Content-Location: http://www.cade.com #Accept-Ranges: bytes #Content-Length: 74623 #Last-Modified: Mon, 17 Apr 2000 18:13:11 GMT $Content_Type = $res->content_type; &glynx_file::modify_file_attrib($filename, 'Content-Type', $Content_Type); # (from: UserAgent.pm) if (my $lm = $res->last_modified) { # make sure the file has the same last modification time utime $lm, $lm, $filename; } # REDIRECT: # Location: indica que um novo documento deve ser obtido # Content-Location: indica o lugar onde este documento esta armazenado # Content-Base: indica o diretorio onde este documento esta a