[Kc] HTTP Links

Garrett Goebel ggoebel at goebel.ws
Mon Aug 7 09:32:53 PDT 2006


Here's a third try for you using:
o  Getopt::Long for command line option processing
o  LWP::UserAgent to fetch content from uri's instead of a file
o  Documentation


#!/usr/bin/perl
use strict;
use warnings;
use version; our $VERSION = qv(0.0.3);
use Getopt::Long;
use Pod::Usage;
use Regexp::Common qw(URI);
use LWP::UserAgent;
use HTML::SimpleLinkExtor;

# Specify and process command line options
my %Config;
GetOptions(
     \%Config,
     'uri|u=s',
     'extension|ext|e=s',
     'help|h|?',
     'man|m') or pod2usage(2);
pod2usage(1)                                if $Config{help};
pod2usage(-exitstatus => 0, -verbose => 2)  if $Config{man};
pod2usage('--uri is required!')             if !$Config{uri};

# Check that uri schema is supported by HTTP::Request for GET requests
my $valid_uri = 0;
for my $scheme (qw(HTTP FTP file NNTP gopher)) {
     $valid_uri++ if $Config{uri} =~ /$RE{URI}{$scheme}/;
}
$valid_uri++ if $Config{uri} =~ /$RE{URI}{HTTP}{-scheme => 'https'}/;
pod2usage("invalid or unsupported uri: $Config{uri}") if !$valid_uri;

$Config{extension} ||= '*';

# Fetch content from uri and print extracted links
my $ua       = LWP::UserAgent->new;
my $request  = HTTP::Request->new(GET => $Config{uri});
my $response = $ua->request($request);
if (!$response->is_success) {
     die("Unable to process request for uri: $Config{uri}:\n",
         $response->status_line);
}
my $content  = $response->content;

my $extor = HTML::SimpleLinkExtor->new;
$extor->parse($content);
foreach my $link ($extor->links) {
     next if $link !~ m/\.$Config{extension}/ixms;
     print "$link\n";
}

1;

__END__

=head1 NAME

link_extract.pl

=head1 SYNOPSIS

link_extract.pl --uri=http://www.perl.org/books/beginning-perl/ -e=pdf

Options:
      --uri|u            source uri (file:///foo.pdf, http://foo.com/ 
bar.html)
                         supported schemes: http https ftp file news  
gopher
      --extension|ext|e  file extension of links to be extracted
      -help              brief help message
      -man               full documentation

=head1 OPTIONS

=over 4

=item B<-help>

Print a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=back

=head1 DESCRIPTION

B<link_extractor.pl> will fetch and parse the content of the given -- 
uri and
print a list of all links matching the supplied file --extension.



On Aug 6, 2006, at 1:14 PM, djgoku wrote:

> #!/usr/bin/perl
> #
> # My second try, for get_http.pl
> # Syntax: get_http.pl filename (pdf|html|tar|etc)
> # Todo: Add use for web links (http://blah.com/blah
>
> use strict;
> use warnings;
>
> use HTML::SimpleLinkExtor;
>
> my $extor = HTML::SimpleLinkExtor->new();
>
> # Filename Stuff
> my $source = shift @ARGV;
> $extor->parse_file($source);
> my @links = $extor->links;
>
> # Filetype Stuff
> my $filetype = '*';
> $filetype = shift @ARGV if (@ARGV);
>
> # Print only found $filetype
> foreach (@links) {
> 	print "$_\n" if (m{\s*\.$filetype}i);
> }
> _______________________________________________
> kc mailing list
> kc at pm.org
> http://mail.pm.org/mailman/listinfo/kc
>



More information about the kc mailing list