#!perl -w #----------------------------------------------------------------------------- # BloodHound.pl # # Part of the Data Carving Utility Library (DCUL). # http://www.sftsrc.com/DCUL/ #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Copyright (c) 2006, SoftSource Consulting # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation # and/or other materials provided with the distribution. # * Neither the name of SoftSource Consulting nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
#----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Version history: # # Version 1.0, 11th July 2006: # Initial release: John Goalby, john.goalby@sftsrc.com #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Description: # # Bloodhound goes through an input file such as a disk image searching for # known files. These known files can be anything as long as they are at least # as large as the blocksize specified on the command line. # # Bloodhound reads the first blocksize bytes of the files to search for. # This gives a strong indication that the file is at that byte offset in the # input file. It does not guarantee it though. Issues with data # fragmentation make solving this problem efficiently very difficult. # # We don't just look for the files at block boundaries. This means we will # catch attempts at hiding files as well as embedded files such as # attachments. # # We create a regular expression to quickly search for potential hits. This # means that we don't have to create an MD5 hash for every single sequence of # blocksize bytes which would be very expensive. # # There is a potential issue with files that are fragmented after the very # first block. If it is fragmented this way and the file does not begin at the # first byte of the block we will not find it. I feel that this problem is # of minor importance to solve given the added complexity that its solution # would entail. # # Examples of use: # # Quickly searching a drive image for trade secrets, such as source code. # Quickly searching a drive image for images in a criminal investigation. # # Design: # # This script was designed for submission to the DFRWS data carving challenge # (http://www.dfrws.org/2006/challenge/). As such, the challenge was to # examine a 50MB disk image. 
# This is obviously very small compared to disk
# images in real use. If you need this functionality on a bigger scale please
# contact us for information on our enterprise version.
#-----------------------------------------------------------------------------

#-----------------------------------------------------------------------------
# Import statements.
#-----------------------------------------------------------------------------
use strict;
use IO::File;
use Getopt::Long;
use File::Basename;
use File::Find;
use Digest::MD5 qw (md5 md5_hex);
use POSIX qw (strftime);
use Time::Local;
use Cwd;

#-----------------------------------------------------------------------------
# Local imports.
#-----------------------------------------------------------------------------
use Helpers::DCULHelper;

#-----------------------------------------------------------------------------
# Various constants.
#-----------------------------------------------------------------------------
use constant VERSION_NUMBER => "1.0";
use constant MAXALTERNATES => 10;
use constant DEFAULT_BLOCK_SIZE => 512;

#-----------------------------------------------------------------------------
# Capture the name of this script for use later on.
#-----------------------------------------------------------------------------
my $g_thisscript = $0;
my $g_thisscriptbasename = basename ($g_thisscript);

#-----------------------------------------------------------------------------
# Globals
#-----------------------------------------------------------------------------
my %g_md5hashes;        # MD5 of a search file's first block => file name(s).
my @g_filestoprocess;   # Files we are searching for.
my $g_regexp = "";      # Text of the candidate-matching regular expression.
my @g_rearr;            # Per byte position: hash of hex byte values seen.
my $g_regexplen = 0;    # Number of leading byte positions used in the RE.
my $g_pdffilename = "";

#-----------------------------------------------------------------------------
# Command line option globals.
#-----------------------------------------------------------------------------
my %g_options;
my %g_optionhelp;

#-----------------------------------------------------------------------------
# Invocation related globals.
#-----------------------------------------------------------------------------
my $g_starttime;
my $g_starttimetext;
my @g_scriptlines;

#-----------------------------------------------------------------------------
# Global counters.
#-----------------------------------------------------------------------------
my $g_matchesmade = 0;
my $g_totalbytesread = 0;

#-----------------------------------------------------------------------------
# Capture the command line parameters. Need to do this before we process them
# with GetOptions as that seems to remove them. Also get the number of params
# that were passed in.
#-----------------------------------------------------------------------------
my $g_cmdline = $g_thisscript . DCULHelper::GetCommandLineParams();
my $g_paramspassed = scalar @ARGV;

#-----------------------------------------------------------------------------
# Globals for the command line options.
#-----------------------------------------------------------------------------

# Housekeeping
my ($gopt_help, $gopt_version);

# Various input.
my ($gopt_blocksize, $gopt_file, $gopt_searchfile);

# PDF processing
my ($gopt_outputpdf, $gopt_reporttitle);
my ($gopt_subtitle, $gopt_author);

#-----------------------------------------------------------------------------
# Process the command line arguments. GetOptions returns false when it has
# reported a problem (unknown option, non-numeric block size, ...). Stop in
# that case rather than silently running with defaults the user did not ask
# for.
#-----------------------------------------------------------------------------
InitializeOptionInfo();
GetOptions (%g_options)
    or die "Invalid command line; use --help for usage information.\n";

#-----------------------------------------------------------------------------
# Setup defaults for command line params that were not specified by the user.
#-----------------------------------------------------------------------------
$gopt_author = "anonymous" unless defined $gopt_author;
$gopt_reporttitle = "Untitled" unless defined $gopt_reporttitle;
$gopt_outputpdf = 1 unless defined $gopt_outputpdf;
$gopt_blocksize = DEFAULT_BLOCK_SIZE unless defined $gopt_blocksize;

#-----------------------------------------------------------------------------
# If the user just wants version or help information then output that now.
# The exit is unconditional; it must not depend on whether printing the text
# succeeded.
#-----------------------------------------------------------------------------
if ($gopt_help || ($g_paramspassed == 0)) {
    PrintUsage();
    exit;
}

if ($gopt_version) {
    PrintVersion();
    exit;
}

#-----------------------------------------------------------------------------
# Check that the parameters are sufficient for our needs.
#-----------------------------------------------------------------------------
die "Must specify an input file.\n" unless defined ($gopt_file);
die "$gopt_file must be a file.\n" unless (-f $gopt_file);
die "Must specify a search file.\n" unless defined ($gopt_searchfile);
die "Search Input file ($gopt_searchfile) must exist\n"
    unless ((-f $gopt_searchfile) || (-d $gopt_searchfile));
die "Block size must be >= 32 and <= 32768.\n"
    unless (($gopt_blocksize >= 32) && ($gopt_blocksize <= 32768));

#-----------------------------------------------------------------------------
# Do it!
#-----------------------------------------------------------------------------

# Take care of getting things initialized.
StartUp();

# Output information about this script and environment.
PrintScriptInformation();
PrintEnvironmentInformation();

# Capture the user input on which files to search for.
CaptureInputFiles();

# Pre-Populate the array.
PrePopulateRegularExpressionArray();

# Add the MD5 hashes of the input files to the internal hash.
AddInputFilesToMD5Hash();

# Determine a regular expression length based on the data we just read.
DetermineRegularExpressionLength();

# Build the regular expression text from the chosen length.
GenerateRegularExpression();

# Report how this run was invoked.
PrintInvocationInformation();

# Scan the input file for the search files.
ProcessFile();

# Summarize and release resources.
ShutDown();

#-----------------------------------------------------------------------------
# Register every command line option: the Getopt::Long specification mapped
# to the variable that receives the value (%g_options), and the option name
# mapped to its help text (%g_optionhelp).
#-----------------------------------------------------------------------------
sub InitializeOptionInfo {
    # One row per option:
    #   [ Getopt::Long spec, destination variable, help key, help text ]
    my @optiontable = (
        [ "version|v", \$gopt_version, "version",
          "Output version information and exit." ],

        [ "help|?", \$gopt_help, "help",
          "Display this help and exit." ],

        [ "author|a:s", \$gopt_author, "author",
          "The author for the PDF report." ],

        [ "outputpdf|pdf!", \$gopt_outputpdf, "outputpdf",
          "Specify whether to output to PDF in ".
          "addition to console. Default is to output to both." ],

        [ "title|t:s", \$gopt_reporttitle, "title",
          "The title for the PDF report." ],

        [ "subtitle|u:s", \$gopt_subtitle, "subtitle",
          "The subtitle for the PDF report." ],

        [ "file|f=s", \$gopt_file, "file",
          "Specify the input file to process." ],

        [ "searchfiles|s=s", \$gopt_searchfile, "searchfiles",
          "Specify file to search for or a ".
          "directory of files to search for. The script will use all ".
          "files in subdirectories." ],

        [ "blocksize|bs:i", \$gopt_blocksize, "blocksize",
          "The blocksize to use for matching. " .
          "Default is ".DEFAULT_BLOCK_SIZE.". If using a disk image as the ".
          "input file it is best to use the sector size." ],
    );

    # Copy the table into the two global lookup hashes.
    foreach my $row (@optiontable) {
        my ($spec, $destination, $helpkey, $helptext) = @$row;

        $g_options{$spec} = $destination;
        $g_optionhelp{$helpkey} = $helptext;
    }
}

#-----------------------------------------------------------------------------
# Capture the input files specified by the user. Either a directory that
# contains files or a single file.
#-----------------------------------------------------------------------------
sub CaptureInputFiles {
    # If the user specified a directory.
    if (-d $gopt_searchfile) {
        # Process all files in the search directory (and below it).
        find ({ wanted => \&ProcessSearchDirectory, no_chdir => 1 },
              $gopt_searchfile);
    }
    else {
        # Just add the one to the array.
        push (@g_filestoprocess, $gopt_searchfile);
    }
}

#-----------------------------------------------------------------------------
# Process the passed in file. We are trying to match a string of block size
# bytes. We read in a block size at a time, so, to match properly we also
# need to have the next block loaded too. We only consider sequences that
# start in the first (previous) buffer. This is shown below.
#
#               <------- match -------->
# <------- prevbuffer --------><------- buffer -------->
#
# We capture how much was read so that we can terminate correctly. We need
# to terminate when the read that became prevbuffer returned 0 bytes as we
# still want to search the very last buffer, not just the last but one
# buffer. This is shown below with \0 indicating eof.
#
# <------- match ------------->
# <------- prevbuffer -------->\0
#
# Every candidate position that the regular expression accepts is MD5
# checked. After a candidate the scan resumes one byte later (not after the
# end of the candidate), so a false positive cannot hide a real match that
# starts later in the same block.
#
# Reads:    $gopt_file, $gopt_blocksize, $g_regexp, %g_md5hashes
# Updates:  $g_totalbytesread, $g_matchesmade
# Dies:     if the input file cannot be opened or a read fails.
#-----------------------------------------------------------------------------
sub ProcessFile {
    # Details page
    DCULHelper::PrintPageTitle ("Details");

    # Open the input file in binary mode. The explicit "r" mode stops
    # IO::File from interpreting special characters in the file name.
    my $fh = IO::File->new ($gopt_file, "r");
    die "Error reading $gopt_file\n" if !defined ($fh);
    binmode ($fh);

    # Compile the candidate expression once. /s lets "." match any byte,
    # including newline.
    my $candidate_re = qr/$g_regexp/s;

    # Read in a block size up front into the previous buffer.
    my $prevbuffer = "";
    my $bufferread = $fh->sysread ($prevbuffer, $gopt_blocksize);
    die "Error reading $gopt_file: $!\n" if !defined ($bufferread);

    # Use this to keep track of how much was read.
    $g_totalbytesread += $bufferread;

    # Byte offset in the input file of the start of the previous buffer.
    my $offset = 0;

    # While we are still reading the file.
    while (1) {
        # Read the next buffer. sysread returns undef on error; without this
        # check the end-of-file test below would never become true.
        my $buffer = "";
        $bufferread = $fh->sysread ($buffer, $gopt_blocksize);
        die "Error reading $gopt_file: $!\n" if !defined ($bufferread);

        # Use this to keep track of how much was read.
        $g_totalbytesread += $bufferread;

        # Add the newly read buffer onto the end of the previous.
        my $fullbuf = $prevbuffer.$buffer;

        # While there are more candidates.
        while ($fullbuf =~ /$candidate_re/g) {
            # Capture the match position straight away. @- and @+ avoid
            # the program-wide cost of using $& and $`.
            my $lenpre = $-[0];
            my $matchlen = $+[0] - $-[0];

            # Candidates are found in increasing order. Anything starting
            # in the second buffer is handled by the next iteration.
            last if ($lenpre >= $gopt_blocksize);

            if ($matchlen > 0) {
                # MD5 hash the found string.
                my $md5 = md5_hex (substr ($fullbuf, $lenpre, $matchlen));

                # If the MD5 hash was found
                if (exists $g_md5hashes{$md5}) {
                    # Calculate the block and offset from the
                    # block if appropriate.
                    my $byteoffset = $offset + $lenpre;
                    my $blocknum = int ($byteoffset / $gopt_blocksize);
                    my $blocknumrem = $byteoffset % $gopt_blocksize;

                    # Output the match information, including the file.
                    DCULHelper::PrintLn ("Byte offset: ".$byteoffset.
                        " Block: $blocknum, Offset: $blocknumrem");
                    DCULHelper::PrintLn ("\t".$g_md5hashes{$md5});

                    # Increment the match counter.
                    $g_matchesmade++;
                }
            }

            # Resume the scan one byte after the start of this candidate
            # so that overlapping candidates are examined as well.
            pos ($fullbuf) = $lenpre + 1;
        }

        # The previous buffer is now the current buffer.
        $prevbuffer = $buffer;

        # Next please.
        $offset += $gopt_blocksize;

        # If the buffer we just read was empty we have searched everything.
        last if ($bufferread == 0);
    }

    # Don't forget to close the input file.
    close ($fh);
}

#-----------------------------------------------------------------------------
# Do things that need to happen at the start of the script: normalize the
# search path, record the start time and, if PDF output is wanted, derive the
# PDF file name and start the PDF helper.
#-----------------------------------------------------------------------------
sub StartUp {
    # Change the slashes in search file to all be the same direction.
    $gopt_searchfile =~ s/\\/\//g;

    # Get the time that we started doing the actual processing.
    $g_starttime = time();
    $g_starttimetext = DCULHelper::FormatDateTime ($g_starttime);

    # If the user wants PDF output.
    if ($gopt_outputpdf) {
        # Start with the script filename.
        $g_pdffilename = $g_thisscriptbasename;

        # Replace the extension with a space; it becomes the "_" separator
        # in front of the time stamp below.
        $g_pdffilename =~ s/\..*$/ /;

        # Add the time and date.
        $g_pdffilename .= $g_starttimetext;

        # Replace spaces, tabs and :'s with underscores.
        $g_pdffilename =~ tr/ \t\:/_/;

        # Add the extension.
        $g_pdffilename .= ".pdf";

        # Initialize the PDF helper.
        PDFHelper::StartUp ($g_pdffilename, $gopt_author,
            "$g_thisscriptbasename Report", $gopt_subtitle,
            $gopt_reporttitle);
    }
}

#-----------------------------------------------------------------------------
# Do things that need to happen at the end of the script: output the summary
# and, if PDF output is wanted, the script source, then close the PDF helper.
#-----------------------------------------------------------------------------
sub ShutDown {
    # Output a new page title for the summary.
    DCULHelper::PrintPageTitle ("Summary");

    # Get the end time.
    my $endtime = time();

    # Output the time taken.
    my $totaltime = $endtime - $g_starttime;
    DCULHelper::PrintLn ("Total time taken: $totaltime second(s)");

    # Output the number of matches.
    DCULHelper::PrintLn ("Total number of matches: $g_matchesmade");

    # Output the total bytes read from the input file.
    DCULHelper::PrintLn ("Input file total bytes read: $g_totalbytesread");

    # If the user wanted PDF output.
    if ($gopt_outputpdf) {
        # Output the source for the script. Don't need to do this for the
        # console output as the user could output it whenever they wanted.
        DCULHelper::OutputScriptSource (\@g_scriptlines);

        # Cleanup the PDF helper.
        PDFHelper::ShutDown();
    }
}

#-----------------------------------------------------------------------------
# Print out useful information about this script: version number, MD5 hash
# of the script source and its file details. The source lines are kept in
# @g_scriptlines for later output by ShutDown.
#-----------------------------------------------------------------------------
sub PrintScriptInformation {
    # Script information page.
    DCULHelper::PrintPageTitle ("Script Information");

    # Output the script version number.
    DCULHelper::PrintLn ("$g_thisscriptbasename version number: ".
        VERSION_NUMBER);

    # Read in this script so can create MD5 hash and output the
    # source if we wanted.
    if (open (my $scriptfh, '<', $g_thisscript)) {
        binmode ($scriptfh);
        @g_scriptlines = <$scriptfh>;
        close ($scriptfh);

        # Put the script into a handy string
        my $scriptsource = join "", @g_scriptlines;

        # Calculate the script's MD5 hash and output.
        DCULHelper::PrintLn ("$g_thisscriptbasename MD5: ".
            md5_hex ($scriptsource));
    }
    else {
        # Best effort only: carry on without the source. No MD5 is
        # reported as a hash of nothing would be misleading.
        warn "Cannot open file $g_thisscript: $!\n";
    }

    # Output the last modified, created and size information for the script.
    DCULHelper::PrintLastModCreatedSizeForFile ($g_thisscript);
}

#-----------------------------------------------------------------------------
# Print out useful information about this invocation.
#-----------------------------------------------------------------------------
sub PrintInvocationInformation {
    # Invocation information page.
    DCULHelper::PrintPageTitle ("Invocation");

    # Output the invocation information.
    DCULHelper::PrintLn ("Started: $g_starttimetext");
    DCULHelper::PrintLn ("Command line: $g_cmdline");
    DCULHelper::PrintLn ("Current working directory: ".cwd());
    DCULHelper::PrintLn ("Block size: $gopt_blocksize bytes");
    DCULHelper::PrintLn ("Read ".scalar (@g_filestoprocess).
        " files from $gopt_searchfile");
    DCULHelper::PrintLn ("Regular expression: $g_regexp");

    # Output the file that PDF output is being output to should
    # it be actually being output.
    DCULHelper::PrintLn ("PDF Output: $g_pdffilename") if $gopt_outputpdf;

    # Output the input file's MD5 hash.
    DCULHelper::PrintMD5HashofInputFile ($gopt_file);

    # Output the last modified, created and size information
    # for the input file.
    DCULHelper::PrintLastModCreatedSizeForFile ($gopt_file);
}

#-----------------------------------------------------------------------------
# Print out useful information about the environment. This information is
# readily available from the command line so is only an option for outputting
# to the PDF file.
#-----------------------------------------------------------------------------
sub PrintEnvironmentInformation {
    # If the user requested PDF output. We don't want to output this
    # information to the console as it is accessible to the user any
    # time they want it.
    if ($gopt_outputpdf) {
        # Output the environment information.
        DCULHelper::PrintEnvironmentInformation();
    }
}

#-----------------------------------------------------------------------------
# Process each found entry in the search directory. This is the File::Find
# "wanted" callback used by CaptureInputFiles.
#-----------------------------------------------------------------------------
sub ProcessSearchDirectory {
    # Add the found file to the list of files to process.
    push (@g_filestoprocess, $File::Find::name) if (-f $File::Find::name);
}

#-----------------------------------------------------------------------------
# Fill in blank hashes for the regular expression array based on the maximum
# possible length we can use (block size). We will later use a smaller length
# based on the number of alternates for a particular byte position.
#-----------------------------------------------------------------------------
sub PrePopulateRegularExpressionArray {
    # One empty hash per byte position in a block.
    for (1 .. $gopt_blocksize) {
        push (@g_rearr, {});
    }
}

#-----------------------------------------------------------------------------
# Determine as optimal a regular expression length as we can. This RE is
# going to be used as an initial match to save having to MD5 hash every single
# block of blocksize bytes (numbytestotal-blocksize times!).
#
# We start off with a minimum length so we at least utilize the RE.
#
# We cut off increasing the length of the RE when the number of distinct
# byte values at a position is greater than half the number of search files.
#
# Reads:   @g_filestoprocess, @g_rearr
# Sets:    $g_regexplen
#-----------------------------------------------------------------------------
sub DetermineRegularExpressionLength {
    # We use this number as the max count of files that can have different
    # values before we stop.
    my $maxfilecount = ((scalar @g_filestoprocess) / 2);

    # Default the regular expression length to the maximum until
    # we determine otherwise.
    $g_regexplen = scalar @g_rearr;

    # Start the loop partway through so that we at least have some bytes
    # we can compare. If we didn't do this then we could end up with a
    # degenerate case that would slow things down enormously.
    for (my $i = 10; $i <= $#g_rearr; $i++) {
        # Get the count of distinct byte values for this byte position.
        my $matchcount = scalar keys %{ $g_rearr[$i] };

        # If the count of unique entries is more than we can abide.
        if ($matchcount > $maxfilecount) {
            # Set the regular expression length and be done.
            $g_regexplen = $i - 1;
            last;
        }
    }
}

#-----------------------------------------------------------------------------
# Create the regular expression from the length we determined previously.
# This will create a nice string that we can later use to match the data as
# we read it from the input file.
#
# Each of the first $g_regexplen byte positions contributes either "." (too
# many alternatives) or a group of \xNN alternatives. The remainder of the
# block is matched with ".{n}" so that a match is always one block long.
# Alternatives are emitted in sorted order so the reported expression is the
# same from run to run.
#
# Reads:   @g_rearr, $g_regexplen, $gopt_blocksize
# Appends: $g_regexp
# Dies:    if a byte position has no recorded values at all.
#-----------------------------------------------------------------------------
sub GenerateRegularExpression {
    # For all of the byte positions we care about.
    for (my $i = 0; $i < $g_regexplen; $i++) {
        # Get the distinct hex byte values for this byte position.
        my @bytevalues = sort keys %{ $g_rearr[$i] };
        my $uniquecount = scalar @bytevalues;

        # If the unique count is more than the max number of alternatives.
        if ($uniquecount > MAXALTERNATES) {
            # Just match anything rather than each of
            # the separate alternatives.
            $g_regexp .= ".";
        }
        elsif ($uniquecount >= 1) {
            # A group holding each possible byte value as an alternate.
            $g_regexp .= "(".join ("|", map { "\\x$_" } @bytevalues).")";
        }
        else {
            # This really should never happen but best to flag
            # it if it does.
            die "The unique count should never be 0\n";
        }
    }

    # Pad the rest of the regular expression to fill up the block size.
    $g_regexp .= ".{".($gopt_blocksize-$g_regexplen)."}";
}

#-----------------------------------------------------------------------------
# Read in all of the input files that we are going to be searching for in the
# image file. In addition to getting the MD5 hash of the first block of the
# file, we also build up the regular expression array so that we can do faster
# searches later.
#
# Reads:   @g_filestoprocess, $gopt_blocksize
# Updates: %g_md5hashes, @g_rearr
# Dies:    if a search file cannot be opened, or if no search file supplied
#          a full first block (there would be nothing to search for).
#-----------------------------------------------------------------------------
sub AddInputFilesToMD5Hash {
    # Input file MD5 page.
    PDFHelper::PrintPageTitlePDF ("Search Files");

    # Buffer for holding the contents of files.
    my $buffer = ' ' x $gopt_blocksize;

    # For each of the files we are going to process.
    foreach my $fileprocess (@g_filestoprocess) {
        # Open the file in binary mode. The explicit "r" mode stops
        # IO::File from interpreting special characters in the file name.
        my $fhin = IO::File->new ($fileprocess, "r");
        die "Error reading $fileprocess\n" if !defined ($fhin);
        binmode ($fhin);

        # Read in just a single block of data. We could go through the
        # entire file but that really increases how long it will take to
        # find matches. It also complicates where you find a block. We
        # went with finding the first block anywhere in an image.
        my $dataread = $fhin->sysread ($buffer, $gopt_blocksize);

        # If we didn't read at least blocksize amount (or the read failed
        # and returned undef), let the user know.
        if (!defined ($dataread) || ($dataread != $gopt_blocksize)) {
            # Indicate that there is a slight issue
            PDFHelper::PrintLinePDF ("$fileprocess failed to read $gopt_blocksize bytes, ignoring.");
        }
        else {
            # Create an array of two digit hex values, one per byte
            # position we are going to be looking at.
            my @initialhex = unpack "H2" x (scalar @g_rearr), $buffer;

            # For each byte we are looking at.
            for (my $i = 0; $i <= $#g_rearr; $i++) {
                # Set the byte's hash to include this character. We
                # essentially build up the possible values for each byte
                # position in the hash. We use a hash so that we don't
                # have duplicates.
                $g_rearr[$i]{$initialhex[$i]} = 1;
            }

            # Create an MD5 hash of the buffer we read.
            my $tempmd5 = md5_hex ($buffer);

            # Output the MD5 hash information.
            PDFHelper::PrintLinePDF ($fileprocess);

            # There is a possibility that the hash already exists.
            if (exists $g_md5hashes{$tempmd5}) {
                # Add this file we processed to the list of files that match
                # this MD5 hash.
                $g_md5hashes{$tempmd5} =
                    $g_md5hashes{$tempmd5}.";$fileprocess";

                # Output the block 0 hash and indicate that
                # this is a duplicate.
                PDFHelper::PrintLinePDF (" block 0: ".
                    $tempmd5." (duplicate)");
            }
            else {
                # Set this file we processed as a file
                # that matches this MD5 hash.
                $g_md5hashes{$tempmd5} = $fileprocess;

                # Output the block 0 hash.
                PDFHelper::PrintLinePDF (" block 0: ".$tempmd5);
            }
        }

        # We no longer need the file, so close it.
        close ($fhin);
    }

    # Without at least one usable search file no regular expression can be
    # built. Say so here instead of failing later with an internal error.
    die "No search file contains at least $gopt_blocksize bytes; ".
        "nothing to search for.\n" unless (%g_md5hashes);
}

#-----------------------------------------------------------------------------
# Output the usage of this script. Returns true so that a caller may chain
# the call with "&& exit".
#-----------------------------------------------------------------------------
sub PrintUsage {
    # Get the text for the required invocation.
    my $requiredinvocation =
        DCULHelper::GetRequiredOptionUsage (\%g_options);

    # Get the text for the options.
    my $commandlineoptions =
        DCULHelper::GetOptionUsage (\%g_options, \%g_optionhelp);

    print <<"ENDOFUSAGE";
Usage: $g_thisscriptbasename $requiredinvocation [OPTIONS]

Search for the presence of files inside another file. Useful if you want to
find specific files within a disk image file. This script quickly finds the
first block of the specified input files within the input file. For more
technical information please feel free to read the source code which has lots
of comments.

Command-line options:
$commandlineoptions

Examples:

Look inside image.dat for all of the files located in the ..\\testfiles
directory:
  $g_thisscriptbasename --file=image.dat --searchfiles=..\\testfiles

Look inside image.dat for the single lookfor.jpg file:
  $g_thisscriptbasename --file=image.dat --searchfiles=lookfor.jpg

Look inside image.dat for all of the files located in the \\images\\evidence
directory using the short form of parameters:
  $g_thisscriptbasename --f=image.dat --s=\\images\\evidence
ENDOFUSAGE

    return 1;
}

#-----------------------------------------------------------------------------
# Output the version information for this script. Returns true so that a
# caller may chain the call with "&& exit".
#-----------------------------------------------------------------------------
sub PrintVersion {
    print <<"ENDOFVERSION";
$g_thisscriptbasename @{[ VERSION_NUMBER ]}
Written by John Goalby.

Part of the Data Carving Utility Library (DCUL).
http://www.sftsrc.com/DCUL/

Copyright (C) 2006 SoftSource Consulting.
ENDOFVERSION

    return 1;
}

#-----------------------------------------------------------------------------
# The end of the road.
#-----------------------------------------------------------------------------