[Kc] HTTP Links
Garrett Goebel
ggoebel at goebel.ws
Mon Aug 7 09:32:53 PDT 2006
Here's a third try for you using:
o Getopt::Long for command line option processing
o LWP::UserAgent to fetch content from uri's instead of a file
o Documentation
#!/usr/bin/perl
use strict;
use warnings;
use version; our $VERSION = qv(0.0.3);
use Getopt::Long;
use Pod::Usage;
use Regexp::Common qw(URI);
use LWP::UserAgent;
use HTML::SimpleLinkExtor;
# Specify and process command line options
my %Config;
GetOptions(
\%Config,
'uri|u=s',
'extension|ext|e=s',
'help|h|?',
'man|m') or pod2usage(2);
pod2usage(1) if $Config{help};
pod2usage(-exitstatus => 0, -verbose => 2) if $Config{man};
pod2usage('--uri is required!') if !$Config{uri};
# Check that uri schema is supported by HTTP::Request for GET requests
my $valid_uri = 0;
for my $scheme (qw(HTTP FTP file NNTP gopher)) {
$valid_uri++ if $Config{uri} =~ /$RE{URI}{$scheme}/;
}
$valid_uri++ if $Config{uri} =~ /$RE{URI}{HTTP}{-scheme => 'https'}/;
pod2usage("invalid or unsupported uri: $Config{uri}") if !$valid_uri;
$Config{extension} ||= '*';
# Fetch content from uri and print extracted links
my $ua = LWP::UserAgent->new;
my $request = HTTP::Request->new(GET => $Config{uri});
my $response = $ua->request($request);
if (!$response->is_success) {
die("Unable to process request for uri: $Config{uri}:\n",
$response->status_line);
}
my $content = $response->content;
my $extor = HTML::SimpleLinkExtor->new;
$extor->parse($content);
foreach my $link ($extor->links) {
next if $link !~ m/\.$Config{extension}/ixms;
print "$link\n";
}
1;
__END__
=head1 NAME
link_extract.pl
=head1 SYNOPSIS
link_extract.pl --uri=http://www.perl.org/books/beginning-perl/ -e=pdf
Options:
--uri|u source uri (file:///foo.pdf, http://foo.com/
bar.html)
supported schemes: http https ftp file news
gopher
--extension|ext|e file extension of links to be extracted
-help brief help message
-man full documentation
=head1 OPTIONS
=over 4
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=back
=head1 DESCRIPTION
B<link_extractor.pl> will fetch and parse the content of the given --
uri and
print a list of all links matching the supplied file --extension.
On Aug 6, 2006, at 1:14 PM, djgoku wrote:
> #!/usr/bin/perl
> #
> # My second try, for get_http.pl
> # Syntax: get_http.pl filename (pdf|html|tar|etc)
> # Todo: Add use for web links (http://blah.com/blah
>
> use strict;
> use warnings;
>
> use HTML::SimpleLinkExtor;
>
> my $extor = HTML::SimpleLinkExtor->new();
>
> # Filename Stuff
> my $source = shift @ARGV;
> $extor->parse_file($source);
> my @links = $extor->links;
>
> # Filetype Stuff
> my $filetype = '*';
> $filetype = shift @ARGV if (@ARGV);
>
> # Print only found $filetype
> foreach (@links) {
> print "$_\n" if (m{\s*\.$filetype}i);
> }
> _______________________________________________
> kc mailing list
> kc at pm.org
> http://mail.pm.org/mailman/listinfo/kc
>
More information about the kc
mailing list