#!/usr/local/bin/perl -w $version = "960604.25"; $comments = 'jfriedl@omron.co.jp'; ## ## This is "webget" ## ## Jeffrey Friedl (jfriedl@omron.co.jp), July 1994. ## Copyright 19.... ah hell, just take it. ## Should work with either perl4 or perl5 ## ## BLURB: ## Given a URL on the command line (HTTP and FTP supported at the moment), ## webget fetches the named object (HTML text, images, audio, whatever the ## object happens to be). Will automatically use a proxy if one is defined ## in the environment, follow "this URL has moved" responses, and retry ## "can't find host" responses from a proxy in case host lookup was slow. ## Supports users & passwords (FTP), Basic Authorization (HTTP), update-if- ## modified (HTTP), and much more. Works with perl4 or perl5. ## ## More-detailed instructions in the comment block below the history list. ## ## ## To-do: ## Fix up how error messages are passed among this and the libraries. ## Add gopher support. ## (not likely to happen soon.... see ## http://uts.cc.utexas.edu/~zippy/url_get.html ## which apparently supports gopher). ## ############################ ## ## 960604.25 -- no changes, but want to bump out with a new version ## of www.pl which contains an important bugfix. ## 960517.24 should now deal with errors from servers and proxies a bit ## better, exiting with error code 1. ## ## 960219.23 now can have @ in passwords, i.e ## ftp://anon:"jfriedl@omron.co.jp"@rtfm.mit.edu/ ## ACCT ^PASSWORD^^^^^^^^^^ ^HOST^^^^^^^ ## 960217.22 Added support for WebgetHOST, etc. ## 960125.20 Added -spoof. ## Made timeouts more strictly followed. ## (related note: network.pl now has enhanced Linux support) ## 951219.19 Lost ftp connections now die with a bit more grace. ## 951031.16 Added -timeout. ## 951017.15 Neat -pf, -postfile idea from Lorrie Cranor ## (http://www.ccrc.wustl.edu/~lorracks/) ## 950911.13 Added Basic Authorization support for http. 
## 950911.12 ## Implemented a most-excellent suggestion by Anthony D'Atri ## (aad@nwnet.net), to be able to automatically grab to a local file of ## the same name as the URL. See the '-nab' flag. ## 950630.10 FTP now works when supplied with a userid & password ## 950426.7 Complete Overhaul: ## Renamed from httpget. Added ftp support (very sketchy at the moment). ## Redid to work with new 'www.pl' library; chucked 'Www.pl' library. ## 941227.5 Added -updateme. Cool! ## 941107.4 Allowed for ^M ending a header line.. ## 940820.3 First sorta'clean net release. ## ## ## ##> ## ## Fetch http and/or ftp URL(s) given on the command line and spit to ## STDOUT. ## ## GENERAL OPTIONS: ## ## -V, -version ## Print version information; exit. ## ## -q, -quiet ## Suppresses all non-essential informational messages. ## ## -h, -help ## Show a usage message and exit. ## ## -d, -debug ## Show some debugging messages. ## ## ## OPTIONS REGARDING STYLE OF REMOTE-SERVER CONTACT: ## ## -p, -post ## If the URL looks like a reply to a form (i.e. has a '?' in it), ## the request is POST'ed instead of GET'ed. ## ## ## -pf, -postfile ## The item after the '?' is taken as a local filename, and the contents ## are POST'ed as with -post ## ## -head ## Gets the header only (for HTTP). This might include such useful ## things as 'Last-modified' and 'Content-length' fields ## (a lack of a 'Last-modified' might be a good indication that it's ## a CGI). ## ## The "-head" option implies "-nostrip", but does *not* imply, ## for example "-nofollow". ## ## -IfNewerThan FILE ## -int FILE ## Only pulls URLs if they are newer than the date the local FILE was ## last written. ## ## -spoof BROWSER ## Some sites return different data for the same page depending upon ## the kind of browser it thinks you are. This is good. But this sux ## if you're trying to get the same type of data with webget and it ## decides to give you different (say, text-only) data. 
So, you can ## have webget pretend to be a particular browser. A few examples: ## -spoof NCSA_Mosaic ## -spoof SPRY_Mosaic/v8.32 ## -spoof Mozilla/2.0 ## -spoof 'NetManage Chameleon WebSurfer' ## ## If you want to risk having the server delete all your files in spite, ## you could try ## -spoof 'Microsoft Internet Explorer/4.40.474beta (Windows 95)' ## but I sure wouldn't recommend it. ## ## ## OPTIONS REGARDING HOW TO PROCESS WHAT IS RETURNED: ## ## -nab, -f, -file ## Rather than spit the URL(s) to standard output, unconditionally ## dump to a file (or files) whose name is that as used in the URL, ## sans path. I like '-nab', but supply '-file' as well since that's ## what was originally suggested. Also see '-update' below for the ## only-if-changed version. ## ## -nnab ## Like -nab, but in addtion to dumping to a file, dump to stdout as well. ## Sort of like the 'tee' command. ## ## -update, -refresh ## Do the same thing as -nab, etc., but does not bother pulling the ## URL if it older than the localfile. Only applies to HTTP. ## Uses the HTTP "If-Modified-Since" field. If the URL was not modified ## (and hence not changed), the return value is '2'. ## ## -ns, -nostrip ## For HTTP items (including other items going through an HTTP proxy), ## the HTTP response header is printed rather than stripped as default. ## ## ## OPTIONS ABOUT NETWORK/HTTP/FTP ENVIRONMENT: ## ## -nf, -nofollow ## Normally, a "this URL has moved" HTTP response is automatically ## followed. Not done with -nofollow. ## ## -nr, -noretry ## Normally, an HTTP proxy response of "can't find host" is retried ## up to three times, to give the remote hostname lookup time to ## come back with an answer. This suppresses the retries. This is the ## same as '-retry 0'. ## ## -r#, -retry#, -r #, -retry # ## Sets the number of times to retry. Default 3. ## ## -np, -noproxy ## A proxy is not used, even if defined for the protocol. 
## ## -timeout TIMESPAN ## -to TIMESPAN ## Time out if a connection can not be made within the specified time ## period. TIMESPAN is normally in seconds, although a 'm' or 'h' may ## be appended to indicate minutes and hours. "-to 1.5m" would timeout ## after 90 seconds. ## ## (At least for now), a timeout causes immediate program death (with ## exit value 3). For some reason, the alarm doesn't always cause a ## waiting read or connect to abort, so I just die immediately.. /-: ## ## I might consider adding an "entire fetch" timeout, if someone ## wants it. ## ## OPTIONS THAT ARE TOO COOL TO FIT IN A GROUPING ABOVE :-) ## ## -updateme ## The special and rather cool flag "-updateme" will see if webget has ## been updated since you got your version, and prepare a local ## version of the new version for you to use. Keep updated! ## ############################ ## ## PASSWORDS AND SUCH FOR FTP ## ## You can use webget to do FTP fetches from non-Anonymous systems and ## accounts. Just put the required username and password into the URL, ## as with ## webget 'ftp://user:password@ftp.somesite.com/pub/pix/babe.gif ## ^^^^^^^^^^^^^ ## Note the user:password is separated from the hostname by a (the last, ## if more than one) '@'. ## ## For anonymous FTP (the default for ftp URLs), the password is your ## username@hostname. The username is taken from the first of: ## "WebgetUSER" environment variable ## "USER" environment variable ## or if neither defined, the string "WWWuser" is used. ## The hostname is taken from the first of ## "WebgetHOST" environment variable ## "HOST" environment variable ## or if not defined, various means are used to try to detect your ## hostname (including running `hostname`) ## ## Of course, you can also use the explicit USER:PASSWORD for anonymous ## FTP where you want explicit control of what is sent as your id. 
Just ## give the username 'anonymous' (or 'anon' or 'ftp' -- webget will supply ## the full "anonymous" for you) and an identification string: ## ## ftp://anon:jfriedl@omron.co.jp@rtfm.mit.edu/.... ## ACCT ^PASSWORD^^^^^^^^^^ ^HOST^^^^^^^ ## ## In any case, you can surround the password with double quotes for ## readablity: ## ftp://anon:"jfriedl@omron.co.jp"@rtfm.mit.edu/.... ## ACCT ^PASSWORD^^^^^^^^^^ ^HOST^^^^^^^ ## ## NOTE: using quotes makes the URL non-standard -- it's ok for webget use, ## but it's not a "real" URL any longer. ## ## Also note that if you use a proxy, it almost certainly won't like being ## given a password with '@' in it. *Webget* knows that the real hostname ## begins after the _last_ '@', but some proxies (i.e the proxy I use :-) ## do not. ## ## PASSWORDS AND SUCH FOR HTTP ## ## You can use the same kind of thing with HTTP, and if so it will provide ## what's know as Basic Authorization. This is >weak< authorization. It ## also provides >zero< security -- I wouldn't be sending any credit-card ## numbers this way (unless you send them my way :-). It seems to be used ## most by providers of free stuff where they want to make some attempt to ## limit access to "known users". ## ## PROXY STUFF ## ## If you need to go through a gateway to get out to the whole internet, ## you can use a proxy if one's been set up on the gateway. This is done ## by setting the "http_proxy" environment variable to point to the ## proxy server. Other variables are used for other target protocols.... ## "gopher_proxy", "ftp_proxy", "wais_proxy", etc. ## ## For example, I have the following in my ".login" file (for use with csh): ## ## setenv http_proxy http://local.gateway.machine:8080/ ## ## This is to indicate that any http URL should go to local.gateway.machine ## (port 8080) via HTTP. 
Additionally, I have ## ## setenv gopher_proxy "$http_proxy" ## setenv wais_proxy "$http_proxy" ## setenv ftp_proxy "$http_proxy" ## ## This means that any gopher, wais, or ftp URL should also go to the ## same place, also via HTTP. This allows webget to get, for example, ## GOPHER URLs even though it doesn't support GOPHER itself. It uses HTTP ## to talk to the proxy, which then uses GOPHER to talk to the destination. ## ## Finally, if there are sites inside your gateway that you would like to ## connect to, you can list them in the "no_proxy" variable. This will allow ## you to connect to them directly and skip going through the proxy: ## ## setenv no_proxy "www.this,www.that,www.other" ## ## I (jfriedl@omron.co.jp) have little personal experience with proxies ## except what I deal with here at Omron, so if this is not representative ## of your situation, please let me know. ## ## RETURN VALUE ## The value returned to the system by webget is rather screwed up because ## I didn't think about dealing with it until things were already ## complicated. Since there can be more than one URL on the command line, ## it's hard to decide what to return when one times out, another is fetched, ## another doesn't need to be fetched, and a fourth isn't found. ## ## So, here's the current status: ## ## Upon any timeout (via the -timeout arg), webget immediately ## returns 3. End of story. Otherwise.... ## ## If any URL was fetched with a date limit (i.e. via ## '-update/-refresh/-IfNewerThan' and was found to not have changed, ## 2 is returned. Otherwise.... ## ## If any URL was successfully fetched, 0 is returned. Otherwise... ## ## If there were any errors, 1 is returned. Otherwise... ## ## Must have been an info-only or do-nothing instance. 0 is returned. ## ## Phew. Hopefully useful to someone. ##< ## ## Where latest version should be. @WEB_base = ('http://www.wg.omron.co.jp/~jfriedl/perl/', 'http://enterprise.ic.gc.ca/~jfriedl/perl/'); $WEB_normal = $WEB_base[0] . 
'webget';
$WEB_inlined = $WEB_base[0] . 'inlined/webget';

require 'network.pl'; ## inline if possible (directive to a tool of mine)
require 'www.pl';     ## inline if possible (directive to a tool of mine)
$inlined = 0;         ## this might be changed by the inline thing.

##
## Exit values. All screwed up.
##
$EXIT_ok          = 0;
$EXIT_error       = 1;
$EXIT_notmodified = 2;
$EXIT_timeout     = 3;

##
## Warn (once, at startup) if the support libraries look older than the
## versions this script was developed against.
##
{
    local(@msg);
    if (!defined($network'version) || $network'version < "960514.7") {
        push(@msg, qq/upgrading "network.pl" is recommended/);
    }
    if (!defined($www'version) || $www'version < "960604.17") {
        push(@msg, qq/upgrading "www.pl" is recommended/);
    }
    if (@msg) {
        foreach $m (@msg) {
            warn qq/WARNING:\n$0: $m\n/;
        }
        ## BUGFIX: this string was single-quoted, so users saw a literal
        ## backslash-n instead of a newline.
        warn((@msg > 1 ? 'These are' : 'It is') . " available at:\n");
        foreach $base (@WEB_base) {
            warn("\t$base\n");
        }
    }
}

## Where to fetch our own replacement from (-updateme).
$WEB = $inlined ? $WEB_inlined : $WEB_normal;

## Global option defaults; may be changed by the command-line parsing below.
$debug    = 0;
$strip    = 1; ## default is to strip
$quiet    = 0; ## also normally off.
$follow   = 1; ## normally, we follow "Found (302)" links
$retry    = 3; ## normally, retry proxy hostname lookups up to 3 times.
$nab      = 0; ## If true, grab to a local file of the same name.
$refresh  = 0; ## If true, use 'If-Modified-Since' with -nab get.
$postfile = 0; ## If true, filename is given after the '?'
$defaultdelta2print = 2048;
$TimeoutSpan = 0;  ## seconds after which we should time out.
##
## Command-line parsing. Flags may appear in any order before the URL(s);
## see &usage for descriptions.
##
while (@ARGV && $ARGV[0] =~ m/^-/) {
    $arg = shift(@ARGV);
    $nab = 1, next if $arg =~ m/^-f(ile)?$/;
    $nab = 1, next if $arg =~ m/^-nab$/;
    $nab = 2, next if $arg =~ m/^-nnab$/;
    $post = 1, next if $arg =~ m/^-p(ost)?$/i;
    $post = $postfile = 1, next if $arg =~ m/^-p(ost)?f(ile)?$/i;
    $quiet = 1, next if $arg =~ m/^-q(uiet)?$/;
    $follow = 0, next if $arg =~ m/^-no?f(ollow)?$/;
    $strip = 0, next if $arg =~ m/^-no?s(trip)?$/;
    $debug = 1, next if $arg =~ m/^-d(ebug)?$/;
    $noproxy = 1, next if $arg =~ m/^-no?p(roxy)?$/;
    $retry = 0, next if $arg =~ m/^-no?r(etry)?$/;
    $retry = $2, next if $arg =~ m/^-r(etry)?(\d+)$/;
    &updateme if $arg eq '-updateme';
    $strip = 0, $head = 1, next if $arg =~ m/^-head(er)?/;
    $nab = $refresh = 1, next if $arg =~ m/^-(refresh|update)/;
    &usage($EXIT_ok) if $arg =~ m/^-h(elp)?$/;
    &show_version, exit($EXIT_ok) if $arg eq '-version' || $arg eq '-V';

    if ($arg =~ m/^-t(ime)?o(ut)?$/i) {
        local($num) = shift(@ARGV);
        ## BUGFIX: the pattern had lost its "\." and so rejected the
        ## documented fractional timespans such as "1.5m".
        &usage($EXIT_error, "expecting timespan argument to $arg\n")
            unless $num =~ m/^\d+(\.\d*)?[hms]?$/;
        &timeout_arg($num);
        next;
    }
    if ($arg =~ m/^-if?n(ewer)?t(han)?$/i) {
        $reference_file = shift(@ARGV);
        &usage($EXIT_error, "expecting filename arg to $arg")
            if !defined $reference_file;
        if (!-f $reference_file) {
            warn qq/$0: ${arg}\'s "$reference_file" not found.\n/;
            exit($EXIT_error);
        }
        next;
    }
    if ($arg eq '-spoof') {
        local($spoof) = shift(@ARGV);
        &usage($EXIT_error, "expecting arg to $arg\n") unless defined $spoof;
        $www'useragent = $spoof; #'
        next;
    }
    if ($arg eq '-r' || $arg eq '-retry') {
        local($num) = shift(@ARGV);
        &usage($EXIT_error, "expecting numerical arg to $arg\n")
            unless defined($num) && $num =~ m/^\d+$/;
        $retry = $num;
        next;
    }
    &usage($EXIT_error, qq/$0: unknown option "$arg"\n/);
}

## Cross-option sanity checks.
if ($head && $post) {
    warn "$0: combining -head and -post makes no sense, ignoring -post.\n";
    $post = 0;
    undef $postfile;
}

if ($refresh && defined($reference_file)) {
    warn "$0: combining -update and -IfNewerThan make no sense, ignoring -IfNewerThan.\n";
    undef $reference_file;
}

if (@ARGV == 0) {
    warn "$0: nothing to do. Use -help for info.\n";
    exit($EXIT_ok);
}

##
## Now run through the remaining arguments (mostly URLs) and do a quick
## check to see if they look well-formed. We won't *do* anything -- just
## want to catch quick errors before really starting the work.
##
@tmp = @ARGV;
$errors = 0;
while (@tmp) {
    $arg = shift(@tmp);
    if ($arg =~ m/^-t(ime)?o(ut)?$/) {
        local($num) = shift(@tmp);
        ## BUGFIX: was "&warn(...)" -- a call to an undefined subroutine,
        ## not the builtin -- and the pattern had lost its "\." (see above).
        if ($num !~ m/^\d+(\.\d*)?[hms]?$/) {
            warn("expecting timespan argument to $arg\n");
            $errors++;
        }
    } else {
        local($protocol) = &www'grok_URL($arg, $noproxy); #'
        if (!defined $protocol) {
            warn qq/can\'t grok "$arg"/;
            $errors++;
        } elsif (!$quiet && ($protocol eq 'ftp')) {
            ## These options are HTTP-only; note them now rather than
            ## silently ignoring them later.
            warn qq/warning: -head ignored for ftp URLs\n/ if $head;
            warn qq/warning: -refresh ignored for ftp URLs\n/ if $refresh;
            warn qq/warning: -IfNewerThan ignored for ftp URLs\n/
                if defined($reference_file);
        }
    }
}
exit($EXIT_error) if $errors;

$SuccessfulCount  = 0;
$NotModifiedCount = 0;

##
## Now do the real thing.
##
while (@ARGV) {
    $arg = shift(@ARGV);
    if ($arg =~ m/^-t(ime)?o(ut)?$/) {
        &timeout_arg(shift(@ARGV));
    } else {
        &fetch_url($arg);
    }
}

## See the "RETURN VALUE" notes in the header for this precedence.
if ($NotModifiedCount) {
    exit($EXIT_notmodified);
} elsif ($SuccessfulCount) {
    exit($EXIT_ok);
} else {
    exit($EXIT_error);
}

###########################################################################
###########################################################################

##
## &timeout_arg(SPAN) -- record the -timeout value in $TimeoutSpan, in
## seconds. SPAN is seconds by default; trailing 'm' or 'h' means
## minutes or hours ("1.5m" -> 90).
##
sub timeout_arg {
    ($TimeoutSpan) = @_;
    $TimeoutSpan =~ s/s//;
    $TimeoutSpan *= 60   if $TimeoutSpan =~ s/m//;
    $TimeoutSpan *= 3600 if $TimeoutSpan =~ s/h//;
}

##
## Print version info to STDERR.
## As a byproduct, returns the basename of $0.
##
sub show_version {
    local($base) = $0;
    $base =~ s,.*/,,;
    print STDERR "This is $base version $version\n";
    $base;
}

##
## &usage(exitval, message);
##
## Prints a usage message to STDERR.
## If MESSAGE is defined, prints that first.
## If exitval is defined, exits with that value. Otherwise, returns.
##
sub usage {
    local($exit, $message) = @_;
    print STDERR $message if defined $message;
    local($base) = &show_version;
    ## NOTE(review): the original usage text was lost to mark-up stripping
    ## in this copy of the file; the summary below was reconstructed from
    ## the option-parsing code and header comments -- verify against a
    ## pristine copy.
    print STDERR <<END_OF_USAGE;
usage: $base [options] URL ...
general:
  -V, -version     print version information and exit.
  -q, -quiet       suppress non-essential informational messages.
  -h, -help        show this message and exit.
  -d, -debug       show some debugging messages.
server contact:
  -p, -post        POST (rather than GET) a URL containing a '?'.
  -pf, -postfile   like -post, but the item after '?' names a local file.
  -head            get the HTTP header only.
  -int FILE, -IfNewerThan FILE
                   only pull URLs newer than the local FILE.
  -spoof BROWSER   claim to be BROWSER in the request.
output:
  -nab, -f, -file  dump to a local file named as in the URL.
  -nnab            like -nab, but copy to stdout as well.
  -update, -refresh
                   like -nab, but only if the URL has changed (HTTP).
  -ns, -nostrip    keep the HTTP response header.
network:
  -nf, -nofollow   do not follow "this URL has moved" responses.
  -nr, -noretry    same as -retry 0.
  -r #, -retry #   proxy host-lookup retry count (default 3).
  -np, -noproxy    do not use a proxy even if one is defined.
  -to T, -timeout T
                   give up after T (seconds; append 'm' or 'h').
misc:
  -updateme        fetch the newest version of this program.
END_OF_USAGE
    exit($exit) if defined $exit;
}

##
## &updateme -- fetch the most-recent version of this script from $WEB to a
## temp file, compare versions, and tell the user what to do. Exits.
##
## NOTE(review): the opening lines of this sub (everything before the first
## open below) and all the <FH> readline operators in this file were lost
## to mark-up stripping in this copy; reconstructed -- verify against a
## pristine copy.
##
sub updateme {
    ## Dump the fetched text, via STDOUT, to the first temp file we can open.
    open(STDOUT, '>'.($tempFile="/tmp/webget.new"))     ||
    open(STDOUT, '>'.($tempFile="/usr/tmp/webget.new")) ||
    open(STDOUT, '>'.($tempFile="/webget.new"))         ||
    open(STDOUT, '>'.($tempFile="webget.new"))          ||
        die "$0: can\'t open a temp file.\n";

    ##
    ## See if we can figure out how we were called.
    ## The seek will rewind not to the start of the data, but to the
    ## start of the whole program script.
    ##
    ## Keep the first line if it begins with #!, and the next two if they
    ## look like the trick mentioned in the perl man page for getting
    ## around the lack of #!-support.
    ##
    if (seek(DATA, 0, 0)) {
        $_ = <DATA>;
        if (m/^#!/) {
            print STDOUT;
            $_ = <DATA>;
            if (m/^\s*eval/) {
                print STDOUT;
                $_ = <DATA>;
                if (m/^\s*if/) {
                    print STDOUT;
                }
            }
        }
        print STDOUT "\n#-\n";
    }

    ## Go get the latest one...
    local(@options);
    push(@options, 'head') if $head;
    push(@options, 'nofollow') unless $follow;
    push(@options, ('retry') x $retry) if $retry;
    push(@options, 'quiet') if $quiet;
    push(@options, 'debug') if $debug;
    local($status, $memo, %info) = &www'open_http_url(*IN, $WEB, @options); #'
    die "fetching $WEB:\n  $memo\n" unless $status eq 'ok';

    $size = $info{'content-length'};
    ## Copy to the temp file, sniffing for the $version = "..." line as we go;
    ## once found, hand the rest of the copy off to &general_read.
    while (<IN>) {
        $size -= length;
        print STDOUT;
        if (!defined $fetched_version && m/version\s*=\s*"([^""]+)"/) {
            $fetched_version = $1;
            &general_read(*IN, $size, 0);
            last;
        }
    }
    $fetched_version = "" unless defined $fetched_version;

    ##
    ## Try to update the mode of the temp file with the mode of this file.
    ## Don't worry if it fails.
    ##
    chmod($mode, $tempFile) if $mode = (stat($0))[2];

    $as_well = '';
    if ($fetched_version eq $version) {
        print STDERR "You already have the most-recent version ($version).\n",
            qq/FWIW, the newly fetched one has been left in "$tempFile".\n/;
    } elsif ($fetched_version <= $version) {
        print STDERR "Mmm, your current version seems newer (?!):\n",
            qq/  your version: "$version"\n/,
            qq/   new version: "$fetched_version"\n/,
            qq/FWIW, fetched one left in "$tempFile".\n/;
    } else {
        print STDERR "Indeed, your current version was old:\n",
            qq/  your version: "$version"\n/,
            qq/   new version: "$fetched_version"\n/,
            qq/The file "$tempFile" is ready to replace the old one.\n/;
        print STDERR qq/Just do:\n  % mv $tempFile $0\n/ if -f $0;
        $as_well = ' as well';
    }
    print STDERR "Note that the libraries it uses may (or may not) need updating$as_well.\n"
        unless $inlined;
    exit($EXIT_ok);
}

##
## Given a list of URLs, fetch'em.
## Parses the URL and calls the routine for the appropriate protocol
## (&fetch_via_http, &fetch_via_ftp, ...). Redirects may push new URLs
## onto @todo from inside the handler (dynamic scope).
##
sub fetch_url {
    local(@todo) = @_;
    local(%circref, %hold_circref);

  URL_LOOP:
    while (@todo) {
        $URL = shift(@todo);
        %hold_circref = %circref;
        undef %circref;

        local($protocol, @args) = &www'grok_URL($URL, $noproxy); #'
        if (!defined $protocol) {
            &www'message(1, qq/can\'t grok "$URL"/); #'
            next URL_LOOP;
        }

        ## call protocol-specific handler, preserving any ALRM handler
        ## around the call.
        local($old_alarm) = $SIG{'ALRM'} || 'DEFAULT';
        $func = "fetch_via_" . $protocol;
        $error = &$func(@args, $TimeoutSpan);
        $SIG{'ALRM'} = $old_alarm;

        if (defined $error) {
            $error =~ s/\n*$/\n/;  ## ensure one ending newline.
            &www'message(1, "$URL: $error"); #'
        } else {
            $SuccessfulCount++;
        }
    }
}

##
## &filedate(FILENAME) -- return FILENAME's mtime formatted as an HTTP
## (RFC 850 style) date string, quoted; 0 if the file can't be stat'ed.
##
sub filedate {
    local($filename) = @_;
    local($filetime) = (stat($filename))[9];
    return 0 if !defined $filetime;
    local($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($filetime);
    return 0 if !defined $wday;
    ## BUGFIX: "Tuesdsy" -> "Tuesday" (this string goes out on the wire in
    ## If-Modified-Since requests). Also, gmtime returns years since 1900,
    ## so take it mod 100 to keep the two-digit field sane past 1999.
    sprintf(qq/"%s, %02d-%s-%02d %02d:%02d:%02d GMT"/,
            ("Sunday", "Monday", "Tuesday", "Wednesday",
             "Thursday", "Friday", "Saturday")[$wday],
            $mday,
            ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[$mon],
            $year % 100, $hour, $min, $sec);
}

##
## &local_filename(PATH) -- name for the local copy of a fetched URL:
## the last path component, or a fresh "fileNNNN" if the path is empty.
##
sub local_filename {
    local($filename) = @_;
    $filename =~ s,/+$,,;  ## remove any trailing slashes
    $filename =~ s,.*/,,;  ## remove any leading path
    if ($filename eq '') {
        ## empty -- pick a random name
        $filename = "file0000";
        ## look for a free random name.
        $filename++ while -f $filename;
    }
    $filename;
}

##
## Redirect STDOUT to FILENAME (for -nab and friends), saving the real
## STDOUT in SAVEOUT so &close_output_file can restore it.
##
sub set_output_file {
    local($filename) = @_;
    if (!open(OUT, ">$filename")) {
        &www'message(1, "$0: can\'t open [$filename] for output"); #'
    } else {
        open(SAVEOUT, ">>&STDOUT") || die "$!";
        open(STDOUT, ">>&OUT");
    }
}

##
## Undo &set_output_file: report the file written and restore STDOUT.
##
sub close_output_file {
    local($filename) = @_;
    unless ($quiet) {
        local($note) = qq/"$filename" written/;
        if (defined $error) {
            $note .= " (possibly corrupt due to error above)";
        }
        &www'message(1, "$note."); #'
    }
    close(STDOUT);
    open(STDOUT, ">&SAVEOUT");
}

##
## SIGALRM handler for -timeout.
## The alarm doesn't seem to cause a waiting syscall to break, so just
## die immediately (exit value 3) rather than setting a flag.
##
sub http_alarm {
    &www'message(1, "ERROR: $AlarmNote."); #'
    exit($EXIT_timeout);
}

##
## Given the host, port, and path, and (for info only) real target,
## fetch via HTTP.
##
## If there is a user and/or password, use that for Basic Authorization.
##
## If $timeout is nonzero, time out after that many seconds.
##
sub fetch_via_http {
    local($host, $port, $path, $target, $user, $password, $timeout) = @_;
    local(@options);
    local($local_filename);

    ##
    ## If we're posting, but -postfile was given, we need to interpret
    ## the item in $path after '?' as a filename, and replace it with
    ## the contents of the file.
    ##
    if ($postfile && $path =~ s/\?([\d\D]*)//) {
        local($filename) = $1;
        return ("can't open [$filename] to POST") if !open(IN, "<$filename");
        ## BUGFIX: was local($/) = '' (paragraph mode, which stops at the
        ## first blank line); undef really does suck up the whole file.
        local($/) = undef; ## want to suck up the whole file.
        $path .= '?' . <IN>;
        close(IN);
    }

    $local_filename = &local_filename($path)
        if $refresh || $nab || defined($reference_file);
    $refresh = &filedate($local_filename) if $refresh;
    $refresh = &filedate($reference_file) if defined($reference_file);

    push(@options, 'head') if $head;
    push(@options, 'post') if $post;
    push(@options, 'nofollow') unless $follow;
    ## BUGFIX: the retry count was hard-coded at 3 here, so -noretry and
    ## -retry N were silently ignored for HTTP fetches.
    push(@options, ('retry') x $retry) if $retry;
    push(@options, 'quiet') if $quiet;
    push(@options, 'debug') if $debug;
    push(@options, "ifmodifiedsince=$refresh") if $refresh;
    if (defined $password || defined $user) {
        local($auth) = join(':', ($user || ''), ($password || ''));
        push(@options, "authorization=$auth");
    }

    if ($timeout) {
        $SIG{'ALRM'} = "main'http_alarm";
        $AlarmNote = "host $host";
        $AlarmNote .= ":$port" if $port != $www'default_port{'http'}; #'
        $AlarmNote .= " timed out after $timeout second";
        $AlarmNote .= 's' if $timeout > 1;
        alarm($timeout);
    }
    local($result, $memo, %info) =
        &www'open_http_connection(*HTTP, $host, $port, $path, $target, @options); #'
    alarm(0) if $timeout;

    if ($follow && ($result eq 'follow')) {
        ## "This URL has moved": queue the new location (in &fetch_url's
        ## dynamically-scoped @todo). %circref exists to catch redirect
        ## loops, so actually check it before re-queueing.
        %circref = %hold_circref;
        return "circular redirection to $memo" if defined $circref{$memo};
        $circref{$memo} = 1;
        unshift(@todo, $memo);
        return undef;
    }

    if ($result eq 'error') {
        $memo .= "\n" . $info{'BODY'} if defined $info{'BODY'};
        return $memo;
    }

    if (!$quiet && $result eq 'status' && ! -t STDOUT) {
        $error = "Warning: $memo";
    }

    if ($info{'CODE'} == 304) { ## 304 is magic for "Not Modified"
        alarm($timeout) if $timeout;
        close(HTTP);
        alarm(0) if $timeout;
        &www'message(1, "$URL: Not Modified") unless $quiet; #'
        $NotModifiedCount++;
        return undef; ## no error
    }

    &set_output_file($local_filename) if $nab;

    unless ($strip) {
        print $info{'STATUS'}, "\n", $info{'HEADER'}, "\n";
        print SAVEOUT $info{'STATUS'}, "\n", $info{'HEADER'}, "\n" if $nab == 2;
    }

    local($doneread) = 0;
    if (defined $info{'BODY'}) {
        print $info{'BODY'};
        print SAVEOUT $info{'BODY'} if $nab == 2;
        $doneread = length $info{'BODY'};
    }
    if (!$head) {
        &general_read(*HTTP, $info{'content-length'}, $doneread, $timeout);
    }

    alarm($timeout) if $timeout;
    close(HTTP);
    alarm(0) if $timeout;
    &close_output_file($local_filename) if $nab;
    $error; ## will be 'undef' if no error
}

##
## Fetch via FTP (passive mode). Handles anonymous and user:password
## logins; see the "PASSWORDS AND SUCH FOR FTP" notes in the header.
##
sub fetch_via_ftp {
    local($host, $port, $path, $target, $user, $password, $timeout) = @_;
    local($local_filename) = &local_filename($path);
    local($ftp_debug) = $debug;
    local(@password) = ($password);

    $path =~ s,^/,,;            ## remove a leading / from the path.
    $path = '.' if $path eq ''; ## make sure we have something

    if (!defined $user) {
        ## Anonymous FTP: password is username@hostname by convention.
        $user = 'anonymous';
        $password = $ENV{'WebgetUSER'} || $ENV{'USER'} || 'WWWuser';
        local($host) = $ENV{'WebgetHOST'} || $ENV{'HOST'} ||
                       &network'addr_to_ascii(&network'my_addr) || ''; #'
        @password = ($password.'@'.$host);
        ## fallback in case the server dislikes the hostname part
        push(@password, $password.'@') if $host;
    } elsif (!defined $password) {
        @password = ("");
    }

    ## allow 'anon' or 'ftp' as shorthands.
    $user = 'anonymous' if ($user eq 'anon') || ($user eq 'ftp');

    local($_last_ftp_reply, $_passive_host, $_passive_port);
    local($size);

    ## Read one (possibly multi-line) reply from the control connection.
    ## Returns (CODE, TEXT); dies if the connection has been lost.
    sub _ftp_get_reply {
        alarm($timeout) if $timeout;
        local($text) = scalar(<FTP_CONTROL>);
        alarm(0) if $timeout;
        die "lost connection to $host\n" if !defined $text;
        local($_, $tmp);
        print STDERR "READ: $text" if $ftp_debug;
        die "internal error: expected reply code in response from " .
            "ftp server [$text]" unless $text =~ s/^(\d+)([- ])//;
        local($code) = $1;
        if ($2 eq '-') {
            ## multi-line reply: accumulate until the "NNN " terminator line.
            while (1) {
                alarm($timeout) if $timeout;
                $_ = <FTP_CONTROL>;
                alarm(0) if $timeout;
                last if !defined $_;
                ($tmp = $_) =~ s/^\d+[- ]//;
                $text .= $tmp;
                last if m/^$code /;
            }
        }
        $text =~ s/^\d+ ?//g;
        ($code, $text);
    }

    ## Read a reply and die unless its code is one of the expected ones.
    sub _ftp_expect {
        local($code, $text) = &_ftp_get_reply;
        $_last_ftp_reply = $text;
        foreach $expect (@_) {
            return ($code, $text) if $code == $expect;
        }
        die "internal error: expected return code " .
            join('|', @_) . ", got [$text]";
    }

    ## Send a command over the control connection.
    sub _ftp_send {
        print STDERR "SEND: ", @_ if $ftp_debug;
        alarm($timeout) if $timeout;
        print FTP_CONTROL @_;
        alarm(0) if $timeout;
    }

    ## Enter passive mode, send COMMANDS, and connect PASSIVE to the
    ## data port the server announced.
    sub _ftp_do_passive {
        local(@commands) = @_;
        &_ftp_send("PASV\r\n");
        local($code) = &_ftp_expect(227, 125);
        if ($code == 227) {
            die "internal error: can't grok passive reply [$_last_ftp_reply]"
                unless $_last_ftp_reply =~ m/\(([\d,]+)\)/;
            local($a, $b, $c, $d, $p1, $p2) = split(/,/, $1);
            ($_passive_host, $_passive_port) = ("$a.$b.$c.$d", $p1*256 + $p2);
        }
        foreach (@commands) {
            &_ftp_send($_);
        }
        alarm($timeout) if $timeout;
        local($error) =
            &network'connect_to(*PASSIVE, $_passive_host, $_passive_port); #'
        alarm(0) if $timeout;
        die "internal error: passive ftp connect [$error]" if $error;
    }

    ## Close the control connection and pass MESSAGE through as our
    ## return value.
    sub done {
        local($message) = shift;
        alarm($timeout) if $timeout;
        close(FTP_CONTROL);
        alarm(0) if $timeout;
        $message;
    }

    ## make the connection to the host
    &www'message($debug, "connecting to $host...") unless $quiet; #'
    if ($timeout) {
        $SIG{'ALRM'} = "main'http_alarm"; ## can use this for now
        $AlarmNote = "host $host";
        $AlarmNote .= ":$port" if $port != $www'default_port{'ftp'}; #'
        $AlarmNote .= " timed out after $timeout second";
        $AlarmNote .= 's' if $timeout > 1;
        alarm($timeout);
    }
    local($error) = &network'connect_to(*FTP_CONTROL, $host, $port); #'
    alarm(0) if $timeout;
    return $error if $error;

    local($code, $text) = &_ftp_get_reply;
    return &done("internal ftp error: [$text]") if ($code != 220);

    ## log in; try each candidate password in turn.
    &www'message($debug, "logging in as $user...") unless $quiet; #'
    foreach $password (@password) {
        &_ftp_send("USER $user\r\n");
        ($code, $text) = &_ftp_expect(230, 331, 530);
        return &done($text) if $code == 530;
        last if $code == 230; ## hey, already logged in, cool.
        &_ftp_send("PASS $password\r\n");
        ($code, $text) = &_ftp_expect(220, 230, 530, 550, 332);
        last if $code != 550;
        last if $text =~ m/can\'t change directory/;
    }
    if ($code == 550) {
        $text =~ s/\n+$//;
        &www'message(1, "Can\'t log in $host: $text") unless $quiet; #'
        exit($EXIT_error);
    }
    if ($code == 332) {
        ## server wants an account name; offer a dummy one.
        &_ftp_send("ACCT noaccount\r\n");
        ($code, $text) = &_ftp_expect(230, 202, 530, 500, 501, 503, 421);
    }
    return &done($text) if $code >= 300;

    &_ftp_send("TYPE I\r\n"); ## binary transfers
    &_ftp_expect(200);

    unless ($quiet) {
        local($name) = $path;
        $name =~ s,.*/([^/]),$1,;
        &www'message($debug, "requesting $name..."); #'
    }

    ## get file
    &_ftp_do_passive("RETR $path\r\n");
    ($code, $text) = &_ftp_expect(125, 150, 550, 530);
    return &done($text) if $code == 530;
    if ($code == 550) {
        alarm($timeout) if $timeout;
        close(PASSIVE);
        alarm(0) if $timeout;
        if ($text =~ /directory/i) {
            ## probably from "no such file or directory", so just return now.
            return &done($text);
        }
        ## do like Mosaic and try getting a directory listing.
        &_ftp_send("CWD $path\r\n");
        ($code) = &_ftp_expect(250, 550);
        return &done($text) if $code == 550;
        &_ftp_do_passive("LIST\r\n");
        &_ftp_expect(125, 150);
    }
    $size = $1 if $text =~ m/(\d+)\s+bytes/;

    binmode(PASSIVE); ## just in case.
    &www'message($debug, "waiting for data...") unless $quiet; #'
    &set_output_file($local_filename) if $nab;
    &general_read(*PASSIVE, $size, 0, $timeout);
    &close_output_file($local_filename) if $nab;

    alarm($timeout) if $timeout;
    close(PASSIVE);
    close(FTP_CONTROL);
    alarm(0) if $timeout;
    undef;
}

##
## &general_read(*INPUT, SIZE2READ, BYTESREAD, TIMEOUT)
##
## Copy the rest of INPUT to STDOUT (and to SAVEOUT if -nnab), printing
## periodic "read N bytes" progress messages (with a transfer rate once
## enough time has passed). SIZE2READ may be undef when the total length
## is unknown; BYTESREAD is how much the caller already consumed.
##
sub general_read {
    local(*INPUT, $size2read, $bytesread, $timeout) = @_;
    local($lastcount) = 0;
    local($need_to_clear) = 0;
    local($start_time) = time;
    local($last_time, $time) = $start_time;

    ## Figure out how often to print the "bytes read" message
    local($delta2print) = (defined $size2read)
        ? int($size2read / 50)
        : $defaultdelta2print;

    $bytesread = 0 if !$bytesread;
    &www'message(0, "read $bytesread bytes") unless $quiet; #'

    ## so $! below is set only if a real error happens from now
    eval 'local($^W) = 0; undef $!';

    while (1) {
        alarm($timeout) if $timeout;
        $_ = <INPUT>;
        alarm(0);
        last if !defined($_);

        ## shove it out.
        &www'clear_message if $need_to_clear; #'
        print;
        print SAVEOUT if $nab == 2;

        ## if we know the content-size, keep track of what we're reading.
        $bytesread += length;
        last if eof || (defined $size2read && $bytesread >= $size2read);

        if (!$quiet && $bytesread > ($lastcount + $delta2print)) {
            if ($time = time, $last_time == $time) {
                ## updating more than once per second: report less often.
                $delta2print *= 1.5;
            } else {
                $last_time = $time;
                $lastcount = $bytesread;
                local($time_delta) = $time - $start_time;
                local($text);
                $delta2print /= $time_delta;
                if (defined $size2read) {
                    $text = sprintf("read $bytesread bytes (%.0f%%)",
                                    $bytesread * 100 / $size2read);
                } else {
                    $text = "read $bytesread bytes";
                }
                if ($time_delta > 5 || ($time_delta && $bytesread > 10240)) {
                    local($rate) = int($bytesread / $time_delta);
                    if ($rate < 5000) {
                        $text .= " ($rate bytes/sec)";
                    } elsif ($rate < 1024 * 10) {
                        $text .= sprintf(" (%.1f k/sec)", $rate / 1024);
                    } else {
                        $text .= sprintf(" (%.0f k/sec)", $rate / 1024);
                    }
                }
                &www'message(0, "$text..."); #'
                $need_to_clear = -t STDOUT;
            }
        }
    }

    if (!$quiet) {
        if ($size2read && ($size2read != $bytesread)) {
            &www'message(1, "WARNING: Expected $size2read bytes, read $bytesread bytes.\n"); #'
        } else {
            &www'clear_message($text); #'
        }
    }
}

##
## Never called for real -- touches various names once more so that
## "perl -w" doesn't complain about identifiers used only once.
##
sub dummy {
    1 || &dummy || &fetch_via_ftp || &fetch_via_http || &http_alarm
      || $www'useragent #'
      || close(OUT) || close(SAVEOUT);
}

__END__