# Extract /Title entries from PDFs embedded in the challenge file.
#
# Fairly simple and ad hoc (based on PDF files), but the mold should be
# obvious: get some unique identifier and search for it. Here we get it; the
# other program searches for it, and then the directory intersect program
# throws away the false positives.
#
# Usage: extract_titles.pl FILE
#
# Output (STDOUT): one empty line, then every cleaned-up title candidate on
# its own line, followed by a final newline.

use strict;
use warnings;

# Return the "/"-separated fragments of $line that contain "Title".
#
# A line that has no literal "/Title" in it yields the empty list. Fragments
# are returned untouched (they may still carry binary junk and a trailing
# newline); clean_title() is responsible for tidying them up. Note that any
# fragment mentioning "Title" is returned, not only the one that directly
# follows the slash, so false positives are expected.
sub extract_title_candidates {
    my ($line) = @_;

    return if $line !~ m{/Title};
    return grep { /Title/ } split m{/}, $line;
}

# Reduce one raw fragment, e.g. "junk Title (Some name) more junk", to the
# bare title text ("Some name") and return it.
#
# The order of the steps matters: control characters are removed *before*
# octal escapes are decoded, so characters produced by the decoding step are
# kept (apart from NUL bytes, which are pruned afterwards).
sub clean_title {
    my ($title) = @_;

    # Remove binary stuff before the title.
    $title =~ s/.*Title/Title/g;

    # Remove other elements after the first closing parenthesis.
    $title =~ s/\).*/\)/g;

    # Be sure control chars are gone: first a backslash followed by a control
    # character, then every remaining control character.
    $title =~ s/\\[[:cntrl:]]//g;
    $title =~ s/[[:cntrl:]]//g;

    # Unescape octal. A PDF octal escape has at most three digits, so cap the
    # match there; otherwise "\0001" (NUL followed by the character "1") would
    # be decoded as the single character chr(1).
    # NOTE(review): only escapes starting with "0" are decoded, as before;
    # escapes such as "\376" are deliberately left verbatim so the output
    # other tools see for them does not change -- confirm this is wanted.
    $title =~ s/\\(0[0-7]{0,2})/chr(oct($1))/eg;

    # Prune unicode: drop the NUL bytes left over from two-byte characters.
    $title =~ s/\000//g;

    # And finally remove the title elements.
    $title =~ s/^Title\(//;
    $title =~ s/^Title \(//;
    $title =~ s/\)$//;

    return $title;
}

# Script entry point: read the file named by the first argument, collect the
# title candidates from every line and print the cleaned-up results.
# Dies with a usage message when no file name is given, and with the system
# error when the file cannot be opened.
sub main {
    my @args = @_;

    if (@args < 1) {
        die "Usage: extract_titles.pl [file to extract PDF titles from]\n";
    }
    my $fn = $args[0];

    print "\n";

    # Three-argument open so the file name can never be taken for an open
    # mode; ":raw" because the input is binary PDF data.
    open my $challfile, '<:raw', $fn
        or die "Can't open file '$fn': $!";

    my @titles;
    while (my $line = <$challfile>) {
        push @titles, extract_title_candidates($line);
    }
    close $challfile;

    @titles = map { clean_title($_) } @titles;

    print join("\n", @titles), "\n";
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without triggering the usage check.
main(@ARGV) unless caller;

1;