use LWP::UserAgent;
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;
#use strict;
#use warnings;
# Tunable settings shared by the rest of the script.
# A search reporting more hits than this is treated as too broad and skipped.
our $threshhold = 100;
# Directory the downloaded files are written into.
our $target_directory = 'bruted_needles';
# Derive a safe local file name from a URL or path.
#
# Arguments:
#   $fn - the link (or path) to turn into a file name
#
# Returns the part after the last "/", with every character outside
# [a-zA-Z0-9.-+~_%] replaced by a literal "\xNN" escape (NN = hex code of
# the character). If nothing is left after stripping the directories, the
# default name "root" is returned.
sub get_filename {
    my ($fn) = @_;

    # Remove directories: drop everything up to and including the last "/".
    $fn =~ s{.*/}{}s;

    # And sanitize. This has to operate on $fn itself; running it on any
    # other variable leaves the unsafe characters in the returned name.
    $fn =~ s/([^a-zA-Z0-9\.\-\+\~_%])/sprintf "\\x%02x", ord($1)/seg;

    # We need to have a filename, so give it a default if it's empty
    if ($fn eq "") {
        $fn = "root";
    }

    return $fn;
}
# Link-extractor callback: called with a tag name followed by that tag's
# attribute name/value pairs. Every attribute value of an "a" tag is
# appended to the global @links; all other tags are ignored.
sub callback {
    my ($tag_name, %link_attrs) = @_;

    # Anything that is not an anchor is of no interest.
    return unless $tag_name eq 'a';

    push @links, values %link_attrs;
}
# Run one query against a search engine and collect the result links.
#
# Arguments:
#   $engine                    - search URL prefix; the encoded query is
#                                appended directly to it
#   $searchstring              - raw text to search for
#   $quote                     - string placed before and after the encoded
#                                query (callers pass "%22", an encoded quote)
#   $engine_distinguishes_pdfs - true if the engine itself restricts results
#                                to PDFs; when false, only links whose URL
#                                contains "pdf" are kept
#
# Side effects: resets the global @links (filled by callback()), appends the
# accepted links to the global @all_links and prints progress to STDOUT.
#
# Returns the number of links accepted, or -1 when the engine reports more
# hits than $threshhold (the query is then considered too broad).
sub get_search_links {
    my ($engine, $searchstring, $quote, $engine_distinguishes_pdfs) = @_;

    # Remove nonprintables because the search engine won't like them.
    $searchstring =~ s/[[:^print:]]//g;
    print "Search: $searchstring\n";

    # Percent-encode everything that is not a plain letter or digit.
    $searchstring =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
    my $url = $engine . $quote . $searchstring . $quote;

    # callback() pushes every anchor link it sees onto @links.
    @links = ();
    my $parse = HTML::LinkExtor->new(\&callback);

    # get() returns undef when the request fails; treat that as an empty
    # page (zero hits, no links) instead of operating on undef.
    my $content = get($url);
    if (!defined $content) {
        warn "Could not fetch $url\n";
        $content = "";
    }

    # Find out how many search results we got by scraping the first line
    # mentioning "Search Results" (Yahoo specific!). The original extra
    # grep with an empty pattern only re-applied the last successful match,
    # so it never filtered anything and has been dropped.
    my ($hits) = grep { /Search Results/ } split(/\n/, $content);
    $hits = "" unless defined $hits;
    $hits =~ s/for.*//g;
    $hits =~ s/.* of//g;
    $hits =~ s/[[:^digit:]]//g;
    $hits = 0 if $hits eq "";
    print $hits . " results. ";

    # Extract the links; eof flushes whatever the parser still has buffered.
    $parse->parse($content);
    $parse->eof;

    # If it's greater than the threshold, fuggedaboutit, it's too broad.
    # (Feel free to tweak this number - it's defined in the beginning.)
    if ($hits > $threshhold) {
        print "too many, skipping this one.\n";
        return -1;
    }
    print "continuing.\n";

    # Remove irrelevant links: keep absolute URLs that do not point back at
    # the search engines or their caches.
    my @results = grep { /:\/\// }
        grep { !/(yahoo|search\/cache|yimg|seobook|myriadsearch|www\.google\.com|\/ask\.com|search\.msn\.com|\/web.\.ask\.com|\/web\.ask\.com)/ }
        @links;

    # Only accept pdfs - if the engine can't discriminate between pdfs and
    # other data, we do by the filename. This must be a truth test: a
    # numeric comparison against "" is true for every non-numeric string,
    # which made the filter run even for PDF-aware engines.
    if (!$engine_distinguishes_pdfs) {
        @results = grep { /pdf/i } @results;
    }

    print join("\n", @results), "\n";
    print "\n";
    push(@all_links, @results);

    return scalar @results;
}
# Download $source into the file $dirname/$fn, printing progress as it goes.
#
# Arguments:
#   $source  - URL to fetch
#   $dirname - existing directory to store the file in
#   $fn      - file name inside $dirname
#
# Nothing is downloaded when the target file already exists. The body is
# streamed to disk chunk by chunk, so large files are not held in memory.
# Dies if the target file cannot be opened for writing; a failed transfer
# only produces a warning, and the incomplete file is removed.
sub download {
    my ($source, $dirname, $fn) = @_;

    my $path = $dirname . "/" . $fn;
    if (-e $path) {
        print "File already exists, skipping!\n";
        return;
    }

    # Three-argument open so characters in $path cannot change the mode.
    open(my $outhandle, '>', $path) or die "Cannot open $path for writing!";
    binmode $outhandle;

    my $user_agent = LWP::UserAgent->new;
    my $count = 0;
    my $response = $user_agent->get(
        $source,
        ":content_cb" => sub {
            my ($chunk, $chunk_response) = @_;
            my $maxlen = $chunk_response->content_length;
            $count += length $chunk;
            print {$outhandle} $chunk;
            if ($maxlen) {
                my $perc = ($count / $maxlen) * 100;
                print "Downloaded $count out of $maxlen bytes so far, $perc % \r";
            } else {
                # No Content-Length header: a percentage cannot be computed,
                # and dividing by the missing length would die here and
                # abort the transfer.
                print "Downloaded $count bytes so far \r";
            }
        }
    );

    close($outhandle) or warn "Could not finish writing $path: $!\n";

    # A failed or aborted transfer must not leave a file behind, otherwise
    # the existence check above would skip this link on every later run.
    if (!$response->is_success || defined $response->header('X-Died')) {
        warn "Download of $source failed: " . $response->status_line . "\n";
        unlink $path;
    }

    return;
}
#my $fn = "dolotai.txt";
# Main driver, part one: read one title per line from the file named on the
# command line and search for links matching each title.
if ($#ARGV < 0) {
    die "Usage: brute_download.pl [title list file]\n";
}
my $fn = $ARGV[0];
open(my $linkfile, '<', $fn) or die "Cannot open link file!\n";

# Search for $query on the PDF-aware engine first. If that finds nothing,
# try a search engine that covers more space, but doesn't discriminate
# between ordinary and PDF files. Returns what get_search_links() returned
# for the last engine tried (a link count, or -1 for "too many hits").
sub _search_with_fallback {
    my ($query) = @_;
    my $found = get_search_links(
        "http://search.yahoo.com/search?vf=pdf&p=", $query, "%22", "true");
    if ($found == 0) {
        $found = get_search_links(
            "http://www.myriadsearch.com/myriad.php?query=", $query, "%22", "");
    }
    return $found;
}

while (my $searchstring = <$linkfile>) {
    chomp $searchstring;
    # A blank line has nothing to search for.
    next if $searchstring =~ /^\s*$/;

    my $useful_links_acquired = _search_with_fallback($searchstring);
    print "ULA: $useful_links_acquired \n";

    # If we didn't find anything and the string is long, try chopping a
    # word off the end and searching again.
    while ($useful_links_acquired == 0 && $searchstring ne "") {
        my $where = rindex($searchstring, " ");
        print "Where are you now: $where \n";
        # No space left means nothing can be chopped off; repeating the
        # identical query would only hit both engines a second time.
        last if $where == -1;

        $searchstring = substr($searchstring, 0, $where);
        print $searchstring . "\n";
        $useful_links_acquired = _search_with_fallback($searchstring);
    }
}
close($linkfile);
# Main driver, part two: sort the collected links, make them unique, list
# them and download each one into $target_directory.
@sorted_all_links = sort { $a cmp $b } @all_links;

# Keep the first occurrence of every link. A seen-hash has no sentinel
# value that a real link could collide with.
my %seen_link;
@out_all_links = grep { !$seen_link{$_}++ } @sorted_all_links;

print "\n\n\n\n---\n";
print join ("\n", @out_all_links), "\n";

# Make the directory (just to be sure). It is fine if it already exists,
# but failing to create it means no download can succeed, so stop with a
# clear message.
unless (-d $target_directory) {
    mkdir($target_directory, 0755)
        or die "Cannot create directory $target_directory: $!\n";
}

foreach my $link (@out_all_links) {
    my $filename = get_filename($link);
    print "Downloading $link into $target_directory / " . $filename . "\n";
    download($link, $target_directory, $filename);
}