use strict;
use warnings;

use LWP::UserAgent;
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;

# brute_download.pl - read a list of document titles (one per line), search
# the web for each title, and download every link that looks like a hit.
#
# Usage: brute_download.pl [title list file]

# A search that reports more hits than this is considered too broad and its
# links are ignored. Feel free to tweak this number.
my $threshhold = 100;

# Directory that receives the downloaded files.
my $target_directory = "bruted_needles";

# Search engine URL prefixes; the URL-encoded query is appended to them.
my $PDF_ENGINE      = "http://search.yahoo.com/search?vf=pdf&p=";
my $FALLBACK_ENGINE = "http://www.myriadsearch.com/myriad.php?query=";

# Links pointing back at the search engines themselves (navigation, caches,
# images, ...) are never useful results.
my $IRRELEVANT_LINK_RE = qr{(?:yahoo|search/cache|yimg|seobook|myriadsearch|www\.google\.com|/ask\.com|search\.msn\.com|/web.\.ask\.com|/web\.ask\.com)};

# Links collected by callback() while one result page is being parsed.
my @links;

# Every useful link found so far, across all searches.
my @all_links;

# get_filename($url)
#
# Derive a local file name from a URL or path: everything up to the last
# "/" is dropped and every character outside [a-zA-Z0-9.-+~_%] is replaced
# by a literal "\xNN" escape. Returns "root" when nothing is left.
sub get_filename {
    my ($fn) = @_;

    # Remove directories
    $fn =~ s/.*\///g;

    # And sanitize. (This used to operate on an unrelated variable, so the
    # name was never actually sanitized.)
    $fn =~ s/([^a-zA-Z0-9\.\-\+\~_%])/sprintf "\\x%02x", ord($1)/seg;

    # We need to have a filename, so give it a default if it's empty
    if ($fn eq "") {
        $fn = "root";
    }

    return $fn;
}

# callback($tag, %attr)
#
# HTML::LinkExtor callback: records the attribute values of every <a> tag
# in @links and ignores all other tags.
sub callback {
    my ($tag, %attr) = @_;

    return if $tag ne 'a';

    push @links, values %attr;
    return;
}

# get_search_links($engine, $searchstring, $quote, $engine_distinguishes_pdfs)
#
# Run one search and append the useful result links to @all_links.
#
#   $engine                    - URL prefix of the search engine
#   $searchstring              - raw (unencoded) search phrase
#   $quote                     - already URL-encoded quote placed around the
#                                phrase (e.g. "%22")
#   $engine_distinguishes_pdfs - true if the engine only returns PDFs; if
#                                false, links are filtered by "pdf" in the URL
#
# Returns the number of links added, 0 if nothing was found or the request
# failed, and -1 if the engine reported more than $threshhold results.
sub get_search_links {
    my ($engine, $searchstring, $quote, $engine_distinguishes_pdfs) = @_;

    # Remove nonprintables because the search engine won't like them.
    $searchstring =~ s/[[:^print:]]//g;
    print "Search: $searchstring\n";

    # URL-encode everything that is not alphanumeric, then quote the phrase.
    $searchstring =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
    $searchstring = $quote . $searchstring . $quote;
    my $url = $engine . $searchstring;

    # Start with a clean slate and set up the parser.
    @links = ();
    my $parser = HTML::LinkExtor->new(\&callback);

    my $content = get($url);
    if (!defined $content) {
        warn "Could not fetch $url\n";
        return 0;
    }

    # Find out how many search results we got (Yahoo specific!): take the
    # first line mentioning "Search Results" and keep only the digits of
    # the total that follows " of".
    # NOTE(review): the original applied a second, empty pattern here whose
    # text appears to have been lost; only the explicit filter is kept.
    my ($count_line) = grep { /Search Results/ } split /\n/, $content;
    my $result_count = defined $count_line ? $count_line : "";
    $result_count =~ s/for.*//g;
    $result_count =~ s/.* of//g;
    $result_count =~ s/[[:^digit:]]//g;
    $result_count = 0 if $result_count eq "";
    print $result_count . " results. ";

    $parser->parse($content);
    $parser->eof;    # flush anything the parser is still buffering

    # If there are too many results, fuggedaboutit, the search is too broad.
    if ($result_count > $threshhold) {
        print "too many, skipping this one.\n";
        return -1;
    }
    print "continuing.\n";

    # Remove relative and irrelevant links.
    my @results = grep { m{://} && !/$IRRELEVANT_LINK_RE/ } @links;

    # Only accept pdfs - if the engine can't discriminate between pdfs and
    # other data, we do by the filename. (A numeric comparison against ""
    # used to make this filter unconditional.)
    if (!$engine_distinguishes_pdfs) {
        @results = grep { /pdf/i } @results;
    }

    print join("\n", @results), "\n";
    print "\n";
    push @all_links, @results;

    return scalar @results;
}

# download($source, $dirname, $fn)
#
# Fetch $source and store it as "$dirname/$fn", streaming the body to disk
# chunk by chunk so large files do not have to fit in memory. An existing
# file is never overwritten. Dies if the file cannot be opened or closed.
#
# Returns nothing if the file already existed, 1 after a successful
# download and 0 if the download failed (the partial file is removed so a
# later run can retry).
sub download {
    my ($source, $dirname, $fn) = @_;

    my $path = $dirname . "/" . $fn;

    if (-e $path) {
        print "File already exists, skipping!";
        return;
    }

    open(my $outhandle, '>', $path)
        or die "Cannot open $path for writing: $!\n";
    binmode $outhandle;

    my $user_agent = LWP::UserAgent->new;
    my $count      = 0;

    my $response = $user_agent->get(
        $source,
        ":content_cb" => sub {
            my ($chunk, $partial, $protocol) = @_;

            $count += length $chunk;
            print {$outhandle} $chunk
                or die "Cannot write to $path: $!\n";

            # Content-Length is optional; only show a percentage when the
            # server told us the total size.
            my $maxlen = $partial->content_length;
            if ($maxlen) {
                my $perc = ($count / $maxlen) * 100;
                print "Downloaded $count out of $maxlen bytes so far, $perc % \r";
            }
            else {
                print "Downloaded $count bytes so far \r";
            }
        }
    );

    close($outhandle) or die "Cannot close $path: $!\n";

    # LWP reports an exception raised inside the callback via X-Died.
    my $died = $response->header('X-Died');
    if (!$response->is_success || defined $died) {
        my $reason = defined $died ? $died : $response->status_line;
        warn "Download of $source failed: $reason\n";
        unlink $path;
        return 0;
    }

    return 1;
}

# _search_engines($searchstring, $announce)
#
# Try the PDF-aware engine first and, if it finds nothing, an engine that
# covers more space but doesn't discriminate between ordinary and PDF
# files. When $announce is true the intermediate result is printed.
# Returns whatever the last get_search_links() call returned.
sub _search_engines {
    my ($searchstring, $announce) = @_;

    my $found = get_search_links($PDF_ENGINE, $searchstring, "%22", "true");
    print "ULA: $found \n" if $announce;

    if ($found == 0) {
        $found = get_search_links($FALLBACK_ENGINE, $searchstring, "%22", "");
    }

    return $found;
}

# main(@args)
#
# Script entry point: collect links for every title in the list file named
# by the first argument, then download each distinct link. Returns 0.
sub main {
    my @args = @_;

    if (!@args) {
        die "Usage: brute_download.pl [title list file]\n";
    }
    my $list_file = $args[0];

    open(my $linkfile, '<', $list_file)
        or die "Cannot open link file!\n";

    while (my $searchstring = <$linkfile>) {
        chomp $searchstring;
        next if $searchstring !~ /\S/;    # nothing to search for

        my $useful_links_acquired = _search_engines($searchstring, 1);

        # If we didn't find anything and the string is long, try chopping
        # a word off the end until something turns up or no words remain.
        while ($useful_links_acquired == 0 && $searchstring ne "") {
            my $where = rindex($searchstring, " ");
            print "Where are you now: $where \n";

            # No word left to chop; repeating the same search is pointless.
            last if $where == -1;

            $searchstring = substr $searchstring, 0, $where;
            print $searchstring . "\n";

            $useful_links_acquired = _search_engines($searchstring, 0);
        }
    }
    close($linkfile);

    # Now sort the links and make them unique
    my %seen;
    my @out_all_links = grep { !$seen{$_}++ } sort { $a cmp $b } @all_links;

    print "\n\n\n\n---\n";
    print join("\n", @out_all_links), "\n";

    # Make directory (just to be sure) and download all the links into it.
    if (!-d $target_directory) {
        mkdir($target_directory, 0755)
            or die "Cannot create directory $target_directory: $!\n";
    }

    foreach my $link (@out_all_links) {
        my $filename = get_filename($link);
        print "Downloading $link into $target_directory / " . $filename . "\n";
        download($link, $target_directory, $filename);
    }

    return 0;
}

exit main(@ARGV) unless caller;