#!perl -w #----------------------------------------------------------------------------- # Ferret.pl # # Part of the Data Carving Utility Library (DCUL). # http://www.sftsrc.com/DCUL/ #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Copyright (c) 2006, SoftSource Consulting # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation # and/or other materials provided with the distribution. # * Neither the name of SoftSource Consulting nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
#----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Version history: # # Version 1.0, 17th July 2006: # Initial release: John Goalby, john.goalby@sftsrc.com #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Description: # # Ferret is a script that will try to determine the content of blocks of an # input file (generally a disk image) based on multiple passes. The script # tries to identify common file signatures and then builds on this information # by looking at the content of blocks. # # Finally, the script uses the information about the blocks to try and # determine complete files. This approach works well in an image of a # fragmented disk because no assumptions are made until we have as much # information as possible. # # The script also provides a summary output of its determination of each # block. This information was useful for debugging, but also could prove # useful to users looking to do manual carving in situations not covered # by the current set of data carving tools. # # Ferret also tries to do some recovery in the case of JPEGs when an end # of image is found at the start of a block. This was in the test image # for the challenge and caused the "saturn" image to be truncated. We # need more data to figure out how to handle this as a generality. # # Lastly, Ferret also does some simple recovery with ZIP files. It # looks to see if there is more data in the file than reported by the meta # information. If so, this script tries to find the combination of blocks # that satisfies the zip CRC value. # # Examples of use: # # To extract various files from a disk image. # To examine the contents of the disk image for manual carving.
# # Design: # # This script was designed for submission to the DFRWS data carving challenge # (http://www.dfrws.org/2006/challenge/). As such, the challenge was to # examine a 50MB disk image. This is obviously very small compared to disk # images in real use. If you need this functionality on a bigger scale please # contact us for information on our enterprise version. #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # References: # # ZIP file format: # # http://www.pkware.com/business_and_developers/developer/popups/appnote.txt # http://www.bsdg.org/SWAG/ARCHIVES/0022.PAS.html # #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Notes. # # In the carving challenge, there is a picture of a hedgehog and a picture of # mars overlapping each other. I have not found a way to disentangle them in # code. It can obviously be done manually, but it would be nicer to do it in # code if there are many thousands of pictures in a drive image! # # I am keeping this note here as a reminder and placeholder for where the # pictures are located: # # The byte offset for the hedgehog picture is: 16115200 # The byte offset for the mars picture is: 16144896 # #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Todo. # # Should really change the block indicators from text to flags so that blocks # could be a combination of indicators such as "binary" and "not JPEG". Then # would need to change the extract code to check appropriately. #----------------------------------------------------------------------------- #----------------------------------------------------------------------------- # Import statements. 
#-----------------------------------------------------------------------------
use strict;
use IO::File;
use Getopt::Long;
use File::Basename;
use Digest::MD5 qw (md5 md5_hex);
use Compress::Zlib;
use POSIX qw (strftime);
use Time::Local;
use Cwd;

# PDF::Create is loaded through a string eval so that the "use" happens at
# run time instead of compile time. The result of the eval ($@) is not
# inspected here. NOTE(review): presumably this makes the module optional -
# confirm how the PDF code copes when the load failed.
eval ("use PDF::Create");

#-----------------------------------------------------------------------------
# Local imports.
#-----------------------------------------------------------------------------
use Helpers::DCULHelper;

#-----------------------------------------------------------------------------
# Block type constants. We use these so that the output of the blocks can be
# somewhat readable by a human when you choose the outputblocks option.
#
# Every marker is a two character string, which keeps the columns printed by
# OutputBlocks the same width. BLOCK_UNKNOWN shares its "--" marker with
# SIG_UNKNOWN below, so an entry of the blocks array compares equal to either
# name.
#-----------------------------------------------------------------------------
use constant BLOCK_UNKNOWN               => "--";
use constant BLOCK_CONTINUE_TEXT         => "tt";
use constant BLOCK_TEXT                  => "|t";
use constant BLOCK_TEXT_END              => "t|";
use constant BLOCK_POSSIBLE_HTML_START   => "|h";
use constant BLOCK_POSSIBLE_HTML         => "hh";
use constant BLOCK_POSSIBLE_HTML_END     => "h|";
use constant BLOCK_BINARY                => "bb";
use constant BLOCK_UNICODE_CONTINUE_TEXT => "uu";
use constant BLOCK_UNICODE_TEXT          => "|u";
use constant BLOCK_POSSIBLE_JPEG_IMAGE   => "jj";
use constant BLOCK_POSSIBLE_NOT_JPEG     => "j*";
use constant BLOCK_POSSIBLE_JPEG_EOI     => "j|";
use constant BLOCK_POSSIBLE_JPEG_EOI_SOB => "J|";
use constant BLOCK_POSSIBLE_JPEG_DATA    => "jh";
use constant BLOCK_POSSIBLE_ZIP_DATA     => "zz";
use constant BLOCK_POSSIBLE_ZIP_END      => "z|";
use constant BLOCK_POSSIBLE_ZIP_DIR      => "z.";
use constant BLOCK_POSSIBLE_NOT_ZIP      => "z*";
use constant BLOCK_POSSIBLE_DOC          => "dd";
use constant BLOCK_POSSIBLE_END_DOC      => "d|";
use constant BLOCK_PROCESSED             => "__";

#-----------------------------------------------------------------------------
# File signature constants.
#
# SIG_MATCH_SIZE is the number of bytes read from the start of each block and
# used as the lookup key into %g_siglookup.
#-----------------------------------------------------------------------------
use constant SIG_MATCH_SIZE => 4;
use constant SIG_UNKNOWN    => "--";
use constant SIG_JPEG       => "JP";
use constant SIG_ZIP        => "ZP";
use constant SIG_MSODOC     => "DC";
use constant SIG_PNG        => "PN";
use constant SIG_NULL       => "00";

#-----------------------------------------------------------------------------
# Signature to function lookup.
#
# The "()" after each constant name stops the fat comma from quoting the name
# as a literal string, so the hash is keyed on the constant's value.
#-----------------------------------------------------------------------------
my %g_sig2fn = (SIG_JPEG()   => \&ExtractJPEG,
                SIG_ZIP()    => \&ExtractZIP,
                SIG_MSODOC() => \&ExtractMSODoc);

#-----------------------------------------------------------------------------
# State constants.
# NOTE(review): these are not referenced in the part of the file reviewed
# here; presumably they drive the format parsers further down - confirm.
#-----------------------------------------------------------------------------
use constant STATE_START          => 0;
use constant STATE_START_SECTION  => 1;
use constant STATE_SECTION        => 2;
use constant STATE_SOS_SECTION    => 3;
use constant STATE_READ_DATA      => 4;
use constant STATE_START_HEADER   => 5;
use constant STATE_WAITING_FOR_DA => 6;
use constant STATE_DONE           => 99;

#-----------------------------------------------------------------------------
# Various constants.
# NOTE(review): the page and font values look like PDF layout measurements
# (presumably in points) - confirm against the PDF helper code.
#-----------------------------------------------------------------------------
use constant VERSION_NUMBER        => "1.0";
use constant US_LETTER_PAGE_WIDTH  => 612;
use constant US_LETTER_PAGE_HEIGHT => 792;
use constant PAGE_WIDTH            => US_LETTER_PAGE_WIDTH;
use constant PAGE_HEIGHT           => US_LETTER_PAGE_HEIGHT;
use constant PAGE_WIDTH_CENTER     => PAGE_WIDTH / 2;
use constant PAGE_HEIGHT_CENTER    => PAGE_HEIGHT / 2;
use constant PAGE_HEADER           => 60;
use constant PAGE_FOOTER           => 60;
use constant REPORT_TITLE_SIZE     => 40;
use constant REPORT_SUBTITLE_SIZE  => 20;
use constant PAGE_TITLE_SIZE       => 20;
use constant AUTHOR_SIZE           => 20;
use constant NORMAL_SIZE           => 10;
use constant NORMAL_LEFT           => 50;
use constant INDENT_LEFT           => 80;
use constant FIXED_SIZE            => 8;
use constant LINE_SPACING          => 5;
use constant TITLE_SPACING         => 20;
use constant START_POS             => PAGE_HEIGHT - PAGE_HEADER;
use constant LINE_INDENT           => 4;
use constant MAX_LINE_WIDTH        => 80;

#-----------------------------------------------------------------------------
# Capture the name of this script for use later on.
#-----------------------------------------------------------------------------
my $g_thisscript = $0;
my $g_thisscriptbasename = basename ($g_thisscript);

#-----------------------------------------------------------------------------
# Globals
#-----------------------------------------------------------------------------
# One entry per block of the input file, holding a SIG_* or BLOCK_* marker.
my @g_blocks;
# Word lookup used by the text passes to decide whether two blocks join up.
my %g_dict;
# Running total of the bytes read by the signature pass.
my $g_totalbytesread = 0;
my $g_pdffilename = "";
my @g_extractstack;
# Extracted file name => newline separated list of its blocks.
my %g_extractdetails;

#-----------------------------------------------------------------------------
# Command line option globals.
#-----------------------------------------------------------------------------
# Getopt::Long specification => reference to the variable receiving the value.
my %g_options;
# Option name => help text for that option.
my %g_optionhelp;
# Getopt::Long type letter => description of that type.
my %g_optionstypelookup = ( "s" => "STRING", "i" => "INTEGER", "f" => "FLOAT" );

#-----------------------------------------------------------------------------
# Invocation related globals.
#-----------------------------------------------------------------------------
my $g_starttime;
my $g_starttimetext;
my @g_scriptlines;

#-----------------------------------------------------------------------------
# Capture the command line parameters. Need to do this before we process them
# with GetOptions as that seems to remove them. Also get the number of params
# that were passed in.
#-----------------------------------------------------------------------------
my $g_cmdline = $g_thisscript . DCULHelper::GetCommandLineParams();
my $g_paramspassed = scalar @ARGV;

#-----------------------------------------------------------------------------
# Global for file signature recognition. In a future version this will be
# contained in a separate configuration file. For the challenge this was
# the easier option, although it has the advantage that the data in this hash
# is also output to the PDF as part of the script.
#
# Every key is exactly SIG_MATCH_SIZE (4) bytes long, because the signature
# pass looks up the first SIG_MATCH_SIZE bytes of each block verbatim.
#-----------------------------------------------------------------------------
my %g_siglookup = ("\xFF\xD8\xFF\xE0" => SIG_JPEG,
                   "\xFF\xD8\xFF\xE1" => SIG_JPEG,
                   "PK\x03\x04"       => SIG_ZIP,
                   "\xD0\xCF\x11\xE0" => SIG_MSODOC,
                   "\x00\x00\x00\x00" => SIG_NULL,
                   "\x89PNG"          => SIG_PNG);

#-----------------------------------------------------------------------------
# Globals for the command line options.
#-----------------------------------------------------------------------------
my ($gopt_help, $gopt_version, $gopt_blocksize, $gopt_file);
my ($gopt_outputpdf, $gopt_reporttitle, $gopt_outdirectory);
my ($gopt_subtitle, $gopt_author, $gopt_outputblocks);

#-----------------------------------------------------------------------------
# Process the command line arguments.
#-----------------------------------------------------------------------------
# InitializeOptionInfo fills %g_options, which is then flattened into the
# specification/destination pairs that GetOptions expects.
InitializeOptionInfo();
GetOptions (%g_options);

#-----------------------------------------------------------------------------
# Setup defaults for command line params that were not specified by the user.
#-----------------------------------------------------------------------------
$gopt_author        = "anonymous" unless defined $gopt_author;
$gopt_reporttitle   = "Untitled"  unless defined $gopt_reporttitle;
$gopt_outputpdf     = 1           unless defined $gopt_outputpdf;
$gopt_blocksize     = 512         unless defined $gopt_blocksize;
$gopt_outdirectory  = "."         unless defined $gopt_outdirectory;
$gopt_outputblocks  = 0           unless defined $gopt_outputblocks;

#-----------------------------------------------------------------------------
# If the user just wants version or help information then output that now.
#
# The exit is unconditional: it must not depend on what the print routine
# happens to return.
#-----------------------------------------------------------------------------
if ($gopt_help || ($g_paramspassed == 0)) {
    PrintUsage();
    exit;
}

if ($gopt_version) {
    PrintVersion();
    exit;
}

#-----------------------------------------------------------------------------
# Check that the parameters are sufficient for our needs.
#-----------------------------------------------------------------------------
die "Must specify an input file.\n" unless defined ($gopt_file);
die "$gopt_file must be a file.\n" unless (-f $gopt_file);

# The blocksize option takes an optional value, so a bare "--blocksize" is
# delivered as 0. Every pass reads SIG_MATCH_SIZE bytes and then the rest of
# the block, so anything that does not leave a remainder cannot work.
die "Block size must be greater than ".SIG_MATCH_SIZE." bytes.\n"
    unless ($gopt_blocksize > SIG_MATCH_SIZE);

#-----------------------------------------------------------------------------
# Do it!
#-----------------------------------------------------------------------------

# Take care of getting things initialized.
StartUp();

# Read in the dictionary.
ReadDictionary();

# Output information about this script and environment.
PrintScriptInformation();
PrintEnvironmentInformation();

# Output the information about the invocation.
PrintInvocationInformation();

# Open the input file in binary mode. We do it here just once
# instead of once in each of the process functions. The explicit "r"
# mode stops characters in the file name from being read as an open mode.
my $fh = IO::File->new ($gopt_file, "r");
die "Error reading $gopt_file: $!\n" if !defined ($fh);
binmode ($fh);

# Process the passed in input file. We do this in multiple passes,
# building up the information we have based on each pass.
# The analysis passes, in the order they must run. Each one refines the
# block markers left behind by the passes before it.
foreach my $pass (\&ProcessFile_CheckSignatures,
                  \&ProcessFile_TextIndicators,
                  \&ProcessFile_BinaryIndicators,
                  \&ProcessFile_JPEGIndicators,
                  \&ProcessFile_ZIPIndicators,
                  \&ProcessFile_TextIndicators2,
                  \&ProcessFile_HTMLIndicators,
                  \&ProcessFile_Cleanup) {
    $pass->($fh);
}

# The block dump has to happen before extraction, because the extract
# routines overwrite the markers of the blocks they have consumed.
OutputBlocks() if ($gopt_outputblocks == 1);

# Pull the files out of the input file.
foreach my $extractor (\&ExtractFiles, \&ExtractHTMLFiles, \&ExtractTextFiles) {
    $extractor->($fh);
}

# Report on what was extracted.
OutputExtractionInfo();

# Done with the input file.
close ($fh);

# Final tidy up.
ShutDown();

#-----------------------------------------------------------------------------
# Register the command line options.
#
# For every option this stores the Getopt::Long specification together with
# a reference to its destination variable in %g_options, and the help text
# in %g_optionhelp under the option's primary name (the part of the
# specification in front of the first "|").
#-----------------------------------------------------------------------------
sub InitializeOptionInfo {
    # Specification, destination variable, help text.
    my @optiontable = (
        [ "version|v", \$gopt_version,
          "Output version information and exit." ],

        [ "help|?", \$gopt_help,
          "Display this help and exit." ],

        [ "blocksize|bs:i", \$gopt_blocksize,
          "The blocksize to use for matching file signatures. ".
          "If using a disk image as the input file it is best to use ".
          "the sector size." ],

        [ "file|f=s", \$gopt_file,
          "Specify the input file to process." ],

        [ "author|a:s", \$gopt_author,
          "The author for the PDF report." ],

        [ "outputpdf|pdf!", \$gopt_outputpdf,
          "Specify whether to output to PDF in addition to console. ".
          "Default is to output to both." ],

        [ "title|t:s", \$gopt_reporttitle,
          "The title for the PDF report." ],

        [ "subtitle|u:s", \$gopt_subtitle,
          "The subtitle for the PDF report." ],

        [ "outputdir|d:s", \$gopt_outdirectory,
          "The directory that output files should go to. ".
          "Default is the current directory." ],

        [ "outputblocks|o!", \$gopt_outputblocks,
          "Whether to output the analysis of all the blocks by this ".
          "script. Can be lengthy but useful for manual carving. ".
          "This information does not go to the PDF file. ".
          "It is intended more as a tool of the forensic analyst than ".
          "a final product. ".
          "It is sent to stdout so can be redirected to a file." ],
    );

    foreach my $entry (@optiontable) {
        my ($spec, $target, $helptext) = @$entry;

        # The help text is filed under the primary option name.
        my ($name) = split (/\|/, $spec);

        $g_options{$spec}    = $target;
        $g_optionhelp{$name} = $helptext;
    }
}

#-----------------------------------------------------------------------------
# Dump the marker decided for every block to stdout.
#
# The markers are laid out 50 to a row, each row labelled with the number of
# its first block. A heading carrying the column numbers 00 to 49 is
# repeated every 2500 blocks, that is every 50 rows.
#-----------------------------------------------------------------------------
sub OutputBlocks {
    print "File Information\n\n";

    foreach my $index (0 .. $#g_blocks) {
        # Start of a new section of 50 rows: emit the column heading.
        # NOTE(review): the heading is indented by a single space while the
        # row labels below are nine characters wide, so the column numbers
        # do not sit above the markers - confirm whether that is intended.
        if (($index % 2500) == 0) {
            print "\n\n ";
            printf ("%02d ", $_) foreach (0 .. 49);
            print "\n";
        }

        # Start of a new row: emit the number of its first block.
        printf ("\n%06d : ", $index) if (($index % 50) == 0);

        # The marker itself.
        print $g_blocks[$index]." ";
    }
}

#-----------------------------------------------------------------------------
# Output the details of the extractions we did.
#-----------------------------------------------------------------------------
# Two report sections are written through the DCULHelper print routines:
#
#   1. A "File Information" page with the MD5 digest of every extracted
#      file followed by the details DCULHelper reports for that file.
#   2. One page per extracted file listing its blocks, ten to a line, each
#      followed by a tab.
#
# The files are taken from the keys of %g_extractdetails; the value stored
# for a file is the newline separated list of its blocks.
#
# A file that cannot be opened is reported with a warning and left out of
# the first section instead of aborting the whole report.
sub OutputExtractionInfo {
    # Both sections walk the files in the same sorted order.
    my @files = sort keys %g_extractdetails;

    # Details page
    DCULHelper::PrintPageTitle ("File Information");

    # For each of the files we extracted.
    foreach my $file (@files) {
        # Open the file for the MD5 hash. The explicit "<" mode stops
        # characters in the file name from being read as an open mode.
        my $infile;
        if (!open ($infile, "<", $file)) {
            warn "Cannot open file $file: $!\n";
            next;
        }
        binmode ($infile);

        # Create the MD5 hash of the content.
        my $md5 = Digest::MD5->new;
        $md5->addfile ($infile);
        close ($infile);

        # Output an MD5 hash of the file.
        DCULHelper::PrintLn ("$file MD5: ".$md5->hexdigest);

        # Output file information.
        DCULHelper::PrintLastModCreatedSizeForFile ($file);
        DCULHelper::PrintLn ("");
    }

    # For each file we extracted.
    foreach my $file (@files) {
        # Take the string of blocks and make an array.
        my @blocks = split "\n", $g_extractdetails{$file};

        # Block title
        DCULHelper::PrintPageTitle (basename ($file)." blocks/sectors.");

        # Output the blocks ten at a time; the last line carries whatever
        # is left over. Nothing is printed for a file without blocks.
        while (my @row = splice (@blocks, 0, 10)) {
            DCULHelper::PrintLn (join ("", map { "$_\t" } @row));
        }
    }
}

#-----------------------------------------------------------------------------
# Look for file signatures in the input file. We assume that file signatures
# are only in the first part of a block. As we expand the number of file
# type we recognize this will likely have to become more flexible.
#
# If we find a signature we fill in the global blocks array for other
# functions to use in their analysis. So, this is a quick scan of the input
# file as part of a pipeline of scans.
#
# Parameters: $_[0] - the handle of the input file.
#
# Reading stops at the end of the file and also when a read fails, so a
# read error cannot keep the loop spinning.
#-----------------------------------------------------------------------------
sub ProcessFile_CheckSignatures {
    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Various variables for reading data.
    my $blocknum = 0;
    my $buffer;
    my $bufferread = 0;

    # While we are still reading the file.
    while (1) {
        # Read in a small block of bytes for matching signatures.
        $bufferread = $fh->sysread ($buffer, SIG_MATCH_SIZE);

        # Stop this train if we didn't read anything. sysread gives 0 at
        # the end of the file and undef on an error; both end the scan.
        last unless $bufferread;

        # Keep track of how much we have read.
        $g_totalbytesread += $bufferread;

        # See if we have a signature match and set the block type,
        # falling back to unknown.
        my $sig = $g_siglookup{$buffer};
        $g_blocks[$blocknum] = (defined $sig) ? $sig : SIG_UNKNOWN;

        # Read in the rest of the block so we can capture how much has
        # been read. Seek doesn't return how much was read.
        $bufferread = $fh->sysread ($buffer, $gopt_blocksize - SIG_MATCH_SIZE);

        # Stop this train if we didn't read anything.
        last unless $bufferread;

        # Keep track of how much we have read.
        $g_totalbytesread += $bufferread;

        # If the first section of the buffer contains NULL's, we need to
        # check the rest of the buffer to see if it only contains NULLs.
        # If we didn't do this check we would assume that all blocks that
        # began with a few NULL's, only contained NULLs.
        if ((defined $sig) && ($sig eq SIG_NULL)) {
            # If the rest of the buffer is NOT all NULLs. The \A and \z
            # anchors matter: "$" would also accept a buffer that ends in
            # a newline byte after the NULLs.
            if ($buffer !~ /\A\x00+\z/) {
                # The block is not just NULLs, hence it is binary.
                $g_blocks[$blocknum] = BLOCK_BINARY;
            }
        }

        # Next block.
        $blocknum++;
    }
}

#-----------------------------------------------------------------------------
# On this pass through the input file we look for indicators of text, either
# ASCII or UNICODE. We also examine 2 blocks to see if there is a
# continuation based on a dictionary lookup. The current block is compared
# with the last text block we encountered. This does not have to be the
# previous block as there obviously could be fragmentation involved. We do
# assume for this script that continuation happens forward only.
#
# Parameters: $_[0] - the handle of the input file.
#
# Only blocks still marked as unknown are examined. Each of them is
# re-marked as some kind of text, as unicode text or as binary.
#-----------------------------------------------------------------------------
sub ProcessFile_TextIndicators {
    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Local vars to keep track of reading the input file. $currentblock is
    # the block the file position is at, so seeks can be relative.
    my $buffer;
    my $currentblock = 0;

    # The last text and the last unicode block we came across.
    my $prevtextbuffer = undef;
    my $prevunicodebuffer = undef;

    # Go through all of the blocks in the input file. Note that this
    # information was created from a previous pass through the input file.
    for (my $blocknum = 0; $blocknum <= $#g_blocks; $blocknum++) {
        # Only blocks marked as unknown in a previous pass are of interest.
        next unless ($g_blocks[$blocknum] eq SIG_UNKNOWN);

        # Go straight to this block in the input file.
        $fh->seek (($blocknum - $currentblock) * $gopt_blocksize, 1);

        # Read the block, putting us at the beginning of the next block.
        $fh->sysread ($buffer, $gopt_blocksize);
        $currentblock = $blocknum + 1;

        # If the buffer only contains text. We determine this by looking
        # for multiple characters above 0x7E. If there are some then we
        # don't think this is text. There also needs to be some alpha
        # characters present. It is not as simple as checking for only
        # text chars as there are some degenerate cases which we take
        # a look at in the else clause.
        if (($buffer !~ /[\x7F-\xFF]{4}/os) && ($buffer =~ /[a-z]/osi)) {
            # If the buffer looks like unicode text (a sequence of NULL
            # separated values).
            if ($buffer =~ /(\x00[^\x00]){5}/os) {
                # The buffer looks a lot like unicode, assume that unless
                # we determine that it is actually a continuation from the
                # previous buffer.
                $g_blocks[$blocknum] = BLOCK_UNICODE_TEXT;

                # Does the buffer begin with unicode text?
                if ($buffer =~ /^(\x00?([a-z]\x00)+)/osi) {
                    # Capture the beginning of this buffer as the
                    # possible end of a word.
                    my $suffix = lc ($1);
                    $suffix =~ s/\x00//osg;

                    # Does the previous text unicode buffer end with text?
                    # The whole trailing run of letters is captured, not
                    # just its last letter.
                    if ((defined $prevunicodebuffer) &&
                        ($prevunicodebuffer =~ /((?:\x00[a-z])+)\x00?$/osi)) {
                        # Capture the end of the previous buffer as the
                        # possible beginning of a word.
                        my $prefix = lc ($1);
                        $prefix =~ s/\x00//osg;

                        # If a word is made lets assume the buffer follows
                        # on from the last.
                        if (exists $g_dict{$prefix.$suffix}) {
                            # Mark the block accordingly.
                            $g_blocks[$blocknum] = BLOCK_UNICODE_CONTINUE_TEXT;
                        }
                    }
                    else {
                        # Look to see if we can make a word just from the
                        # beginning of this block.
                        if (exists $g_dict{$suffix}) {
                            # Check to see if the previous buffer ended
                            # with some whitespace.
                            if ((defined $prevunicodebuffer) &&
                                ($prevunicodebuffer =~ /(\x00\s)\x00?$/os)) {
                                # This could well be a new block, but we
                                # give it the benefit of the doubt.
                                $g_blocks[$blocknum] = BLOCK_UNICODE_CONTINUE_TEXT;
                            }
                        }
                    }
                }

                # Remember this buffer so that the next unicode block can
                # be checked for a continuation. Without this the
                # continuation tests above could never succeed.
                $prevunicodebuffer = $buffer;
            }
            # Does the buffer NOT contain a "word" of at least 5 chars?
            elsif ($buffer !~ /[a-z]{5}/osi) {
                # It is binary as far as we can tell.
                $g_blocks[$blocknum] = BLOCK_BINARY;
            }
            # Does the buffer begin with text?
            elsif ($buffer =~ /^([a-z]+)/osi) {
                # Capture the beginning of buffer as the end of a word.
                my $suffix = lc ($1);

                # Assume text until told otherwise.
                $g_blocks[$blocknum] = BLOCK_TEXT;

                # Does the previous text buffer end with text?
                if ((defined $prevtextbuffer) &&
                    ($prevtextbuffer =~ /([a-z]+)$/osi)) {
                    # Capture the end of the previous buffer as the
                    # beginning of a word.
                    my $prefix = lc ($1);

                    # If a word is made lets assume the buffer follows
                    # on from the last.
                    if (exists $g_dict{$prefix.$suffix}) {
                        # It is continue text.
                        $g_blocks[$blocknum] = BLOCK_CONTINUE_TEXT;
                    }
                }
                else {
                    # If a word is made.
                    if (exists $g_dict{$suffix}) {
                        # It is continue text.
                        $g_blocks[$blocknum] = BLOCK_CONTINUE_TEXT;
                    }
                }
            }
            # Does the buffer begin with numbers?
            elsif ($buffer =~ /^[0-9]{2}/osi) {
                # If we have a previous text buffer and it ends with
                # numbers, then its a continuation.
                if ((defined $prevtextbuffer) &&
                    ($prevtextbuffer =~ /[0-9]$/osi)) {
                    # Lets say that it is a continuation.
                    $g_blocks[$blocknum] = BLOCK_CONTINUE_TEXT;
                }
                else {
                    # Lets just label it as text.
                    $g_blocks[$blocknum] = BLOCK_TEXT;
                }
            }
            # Not text or numbers?
            else {
                # Assume text until told otherwise.
                $g_blocks[$blocknum] = BLOCK_TEXT;

                # Does the previous text buffer end with text?
                if ((defined $prevtextbuffer) &&
                    ($prevtextbuffer =~ /([a-z]+)$/osi)) {
                    # Capture the end of the previous buffer as the
                    # beginning of a word.
                    my $prefix = lc ($1);

                    # If a word is made lets assume the buffer follows on
                    # from the last.
                    if (exists $g_dict{$prefix}) {
                        # Buffer continues.
                        $g_blocks[$blocknum] = BLOCK_CONTINUE_TEXT;
                    }
                }
                else {
                    # If we have a previous text buffer.
                    if (defined $prevtextbuffer) {
                        # Lets be optimistic that this is a continuation.
                        $g_blocks[$blocknum] = BLOCK_CONTINUE_TEXT;
                    }
                }
            }

            # We store this buffer to be used as the previous buffer
            # next pass through.
            $prevtextbuffer = $buffer;
        }
        # Not just text. This is tricky territory.
        else {
            # Does the buffer begin with text?
            if ($buffer =~ /^[a-z]{2,13}\s[a-z]{2,13}\s/osi) {
                # As binary after text, we indicate this could be the
                # end of block of text.
                $g_blocks[$blocknum] = BLOCK_TEXT_END;
            }
            else {
                # Lets label this as binary.
                $g_blocks[$blocknum] = BLOCK_BINARY;
            }

            # We don't have enough to make determination for the
            # next buffer.
            $prevtextbuffer = undef;
            $prevunicodebuffer = undef;
        }
    }
}

#-----------------------------------------------------------------------------
# On this pass through the input file we look for indicators of text that were
# too difficult to determine previously.
#
# Parameters: $_[0] - the handle of the input file.
#
# Blocks marked as unknown, "not ZIP" or "not JPEG" are examined. A block
# that starts with 16 bytes in the range 0x09-0x7E is re-marked as the end
# of a text block when those bytes hold 3 letters in a row, and as binary
# when they do not. Any other block keeps its marker.
#-----------------------------------------------------------------------------
sub ProcessFile_TextIndicators2 {
    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Local vars to keep track of reading the input file. $currentblock is
    # the block the file position is at, so seeks can be relative.
    my $buffer;
    my $currentblock = 0;

    # Go through all of the blocks in the input file. Note that this
    # information was created from a previous pass through the input file.
    for (my $blocknum = 0; $blocknum <= $#g_blocks; $blocknum++) {
        # Only the undecided markers are of interest.
        next unless (($g_blocks[$blocknum] eq SIG_UNKNOWN) ||
                     ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_NOT_ZIP) ||
                     ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_NOT_JPEG));

        # Go straight to this block in the input file.
        $fh->seek (($blocknum - $currentblock) * $gopt_blocksize, 1);

        # Read the block, putting us at the beginning of the next block.
        $fh->sysread ($buffer, $gopt_blocksize);
        $currentblock = $blocknum + 1;

        # Let's try a bunch of non-binary to start.
        if ($buffer =~ /^([\x09-\x7E]{16})/os) {
            # Get the text part.
            my $matched = $1;

            # Check to see that it looks something like real text.
            if ($matched =~ /[a-z]{3}/osi) {
                # As binary after text, we indicate this could be the
                # end of block of text.
                $g_blocks[$blocknum] = BLOCK_TEXT_END;
            }
            else {
                # Lets label this as binary.
                $g_blocks[$blocknum] = BLOCK_BINARY;
            }
        }
    }
}

#-----------------------------------------------------------------------------
# In this pass we go through and look at blocks identified as binary. We
# are then trying to further narrow them down into their binary types by
# using knowledge of various binary file formats.
#
# Our knowledge currently consists of some JPEG, ZIP and DOC formats. Each
# of these file types present problems due to not knowing how long a file
# is. In the case of JPEG, there is no length field as the encoder does not
# know how long the compressed data will be until it is done compressing.
#
# Parameters: $_[0] - the handle of the input file.
#
# The tests are tried in order and the first one that matches decides the
# marker. A block that matches none of them keeps its marker.
#-----------------------------------------------------------------------------
sub ProcessFile_BinaryIndicators {
    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Local vars to keep track of reading the input file. $currentblock is
    # the block the file position is at, so seeks can be relative.
    my $buffer;
    my $currentblock = 0;

    # Go through all of the blocks in the input file.
    for (my $blocknum = 0; $blocknum <= $#g_blocks; $blocknum++) {
        # Only blocks marked as binary or unknown in a previous pass.
        next unless (($g_blocks[$blocknum] eq BLOCK_BINARY) ||
                     ($g_blocks[$blocknum] eq BLOCK_UNKNOWN));

        # Skip through the file to the point we want.
        $fh->seek (($blocknum - $currentblock) * $gopt_blocksize, 1);

        # Read the block, putting us at the beginning of the next block.
        $fh->sysread ($buffer, $gopt_blocksize);
        $currentblock = $blocknum + 1;

        # zip end marker. Needs to be first as there could be other
        # markers in the same block that would otherwise mask the end.
        if ($buffer =~ /PK\x05\x06/os) {
            # Mark the block as possible ZIP end.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_ZIP_END;
        }
        # These are other indicators for ZIP data.
        elsif (($buffer =~ /PK\x06\x08/os) ||
               ($buffer =~ /PK\x05\x05/os) ||
               ($buffer =~ /PK\x06\x06/os) ||
               ($buffer =~ /PK\x06\x07/os)) {
            # Mark it so.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_ZIP_DATA;
        }
        # Look for the zip local header signatures. These are contained
        # throughout a zip file for each of the files contained within.
        elsif ($buffer =~ /PK\x03\x04/os) {
            # Mark the block as possible ZIP data.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_ZIP_DATA;
        }
        # zip directory marker.
        elsif ($buffer =~ /PK\x01\x02/os) {
            # Mark the block as possible ZIP directory.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_ZIP_DIR;
        }
        # Look for the end marker for Word Documents.
        elsif ($buffer =~ /\x38\x00\xF4\x39\xB2\x71\x00\x00/os) {
            # Mark the block as possible DOC end.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_END_DOC;
        }
        # Look for continuation on word docs (W o r d D o...).
        elsif ($buffer =~ /\x57\x00\x6f\x00\x72\x00\x64\x00\x44\x00\x6f/os) {
            # If the previous block was designated an end block. The very
            # first block has no previous block; index -1 would address
            # the last block of the file instead.
            if (($blocknum > 0) &&
                ($g_blocks[$blocknum-1] eq BLOCK_POSSIBLE_END_DOC)) {
                # Mark the previous block as possible DOC.
                $g_blocks[$blocknum-1] = BLOCK_POSSIBLE_DOC;
            }

            # Mark the current block as possible DOC.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_DOC;
        }
        # Look for the end marker for JPEGS.
        elsif ($buffer =~ /^(.*?)\xFF\xD9/os) {
            # Get the match up to the marker.
            my $subbuf = $1;

            # Check to see if we have some incriminating evidence.
            # The missing D0-D7 is for the restart intervals (RST0-RST7)
            # which are actually valid for JPEG.
            if ($subbuf =~ /\xFF[\x01-\xCF\xD8-\xFF]/os) {
                # Mark the block as NOT possible JPEG image data
                # as we cannot have FF followed by the characters above.
                $g_blocks[$blocknum] = BLOCK_POSSIBLE_NOT_JPEG;
            }
            else {
                # Special case the end marker being at the start of a
                # block. This needs more refinement and is currently
                # to help with the challenge. Understanding why the
                # block (95629 (byte offset 48962048)) has this would
                # help us create a more robust solution.
                if ($buffer =~ /^\xFF\xD9/os) {
                    # Mark the block as possible JPEG end of image marker
                    # at the start of the block.
                    $g_blocks[$blocknum] = BLOCK_POSSIBLE_JPEG_EOI_SOB;
                }
                else {
                    # Mark the block as possible JPEG end of image marker.
                    $g_blocks[$blocknum] = BLOCK_POSSIBLE_JPEG_EOI;
                }
            }
        }
        # Look for xFF followed by x00, a sign that the buffer could be
        # JPEG image data. The missing D0-D7 is for the restart intervals
        # (RST0-RST7).
        elsif (($buffer =~ /\xFF\x00/os) &&
               (($buffer !~ /\xFF[\x01-\xCF\xD8-\xFF]/os))) {
            # Mark the block as possible JPEG image data.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_JPEG_IMAGE;
        }
        # Look for xFF followed by NOT x00, a sign that the buffer might
        # NOT be JPEG image data. The missing D0-D7 is for the restart
        # intervals (RST0-RST7).
        elsif ($buffer =~ /\xFF[\x01-\xCF\xD8-\xFF]/os) {
            # Mark the block as NOT possible JPEG image data.
            $g_blocks[$blocknum] = BLOCK_POSSIBLE_NOT_JPEG;
        }
    }
}

#-----------------------------------------------------------------------------
# In this pass we take a more in-depth look at the JPEG format, trying to
# effectively parse the header of a JPEG file to check for consistency.
#-----------------------------------------------------------------------------
sub ProcessFile_JPEGIndicators
{
    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Holder for bytes read from the input file.
    my $firstbyte;
    my $secondbyte;
    my $sectionbyte1;
    my $sectionbyte2;
    my $hilen;
    my $lolen;

    # Position pointers.
    my $curpos = 0;
    my $currentblock = 0;
    my $blocknum = 0;

    # Go through all of the blocks in the input file.
    MAIN: while ($blocknum <= $#g_blocks)
    {
        # If we marked the block as a JPEG signature previously.
        if ($g_blocks[$blocknum] eq SIG_JPEG)
        {
            # Skip through the file to the point we want and update pointers.
            $fh->seek (($blocknum - $currentblock) * $gopt_blocksize, 1);
            $currentblock = $blocknum;
            $curpos = $blocknum * $gopt_blocksize;

            # Get the first 2 bytes from the block.
            $fh->sysread ($firstbyte, 1);
            $curpos++;
            $fh->sysread ($secondbyte, 1);
            $curpos++;

            # Given our signature is the same as this, we should always get
            # these bytes.
            if ((ord ($firstbyte) eq 0xFF) && (ord ($secondbyte) eq 0xD8))
            {
                # Keep going until we are ready to break out due to an
                # unexpected value or the end of what we are parsing.
                INNER: while (1)
                {
                    # Read the section bytes.
$fh->sysread ($sectionbyte1, 1); $curpos++; $fh->sysread ($sectionbyte2, 1); $curpos++; # Needs to be the start of a section. if (ord ($sectionbyte1) eq 0xFF) { # If the start of image data. if (ord ($sectionbyte2) eq 0xDA) { # We cannot determine the length of the image data # so we are done with what we can do here. last INNER; } else { # Read the hi and lo length bytes. $fh->sysread ($hilen, 1); $curpos++; $fh->sysread ($lolen, 1); $curpos++; # Calculate the length. my $length = (ord ($hilen) * 256) + ord ($lolen); # Move to that new position in the input file. It # is conceivable that we could look at the block # indicators and skip some of them if we didn't # think they could be JPEG data and that they were # fragments of another file. The problem with # this is that the data inside a JPEG header can # be text, xml, html, and pretty much anything # else you care to imagine. $fh->seek ($length - 2, 1); $curpos += ($length - 2); } } else { # We were not expecting to get here so be done. last INNER; } } } # The end block is the current block. Add 1 as the int function # truncates the division. my $endblock = int($curpos / $gopt_blocksize) + 1; # For all of the blocks from the one after the signature to the # end block, mark as possible JPEG data. for (my $j = $blocknum + 1; $j <= $endblock; $j++) { # Mark accordingly. $g_blocks[$j] = BLOCK_POSSIBLE_JPEG_DATA; } # We can skip the blocks between the signature we found and # the block we ended up at. We can also set current block to # the same value. $blocknum = $endblock; $currentblock = $endblock; # Seek to the end of the current block and update current # position. $fh->seek ($gopt_blocksize - ($curpos % $gopt_blocksize), 1); $curpos += ($gopt_blocksize - ($curpos % $gopt_blocksize)); } # Next block please. 
#-----------------------------------------------------------------------------
# In this pass we take a more in-depth look at the ZIP format.  Starting at
# every block tagged SIG_ZIP, the block and all following blocks that are not
# tagged as text/HTML/not-ZIP are collected until a BLOCK_POSSIBLE_ZIP_END
# block is reached; the collected run is then handed to
# CheckZIPForConsistency.
#
# Parameters:
#   $_[0] - handle on the input file; seek() and sysread() are called on it.
#
# Side effects: rewinds the handle; @g_blocks may be changed indirectly by
# CheckZIPForConsistency.
#-----------------------------------------------------------------------------
sub ProcessFile_ZIPIndicators {
    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Position pointers.  $currentblock is the block the file position is
    # at, so that seeks can be made relative to it.
    my $currentblock = 0;
    my $blocknum = 0;

    # Go through all of the blocks in the input file.
    MAIN: while ($blocknum <= $#g_blocks) {
        # Holders for data read.
        my $buffer;
        my @zipblocknum;
        my @zipdata;

        # If we marked the block as a ZIP signature previously.
        if ($g_blocks[$blocknum] eq SIG_ZIP) {
            # Skip through the file to the point we want, then read the
            # block, putting us at the beginning of the next block.
            $fh->seek (($blocknum - $currentblock) * $gopt_blocksize, 1);
            $fh->sysread ($buffer, $gopt_blocksize);
            $currentblock = $blocknum + 1;

            # Take care of the data.
            push (@zipblocknum, $blocknum);
            push (@zipdata, $buffer);

            # Need to go to the next block after the SIG to kick things off.
            $blocknum++;

            # Remember explicitly whether an end block was reached.  Testing
            # $g_blocks[$blocknum] after the loop would read one element
            # past the end of @g_blocks whenever no end block exists.
            my $foundend = 0;

            # Continue until we find an end of ZIP.
            INNER_LOOP: while ($blocknum <= $#g_blocks) {
                # If the current block is possibly ZIP related data (we use
                # NOT in the comparison so it is everything but these).
                if (($g_blocks[$blocknum] ne BLOCK_CONTINUE_TEXT) &&
                    ($g_blocks[$blocknum] ne BLOCK_TEXT) &&
                    ($g_blocks[$blocknum] ne BLOCK_TEXT_END) &&
                    ($g_blocks[$blocknum] ne BLOCK_POSSIBLE_HTML_START) &&
                    ($g_blocks[$blocknum] ne BLOCK_POSSIBLE_HTML) &&
                    ($g_blocks[$blocknum] ne BLOCK_POSSIBLE_HTML_END) &&
                    ($g_blocks[$blocknum] ne BLOCK_POSSIBLE_NOT_ZIP) &&
                    ($g_blocks[$blocknum] ne BLOCK_UNICODE_CONTINUE_TEXT) &&
                    ($g_blocks[$blocknum] ne BLOCK_UNICODE_TEXT)) {
                    # Skip to the block and read it, putting us at the
                    # beginning of the next block.
                    $fh->seek (($blocknum - $currentblock) * $gopt_blocksize, 1);
                    $fh->sysread ($buffer, $gopt_blocksize);
                    $currentblock = $blocknum + 1;

                    # Take care of the data.
                    push (@zipblocknum, $blocknum);
                    push (@zipdata, $buffer);

                    # If this is the end of the zip we are done collecting.
                    # $blocknum is deliberately left on the end block; the
                    # outer loop steps past it.
                    if ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_ZIP_END) {
                        $foundend = 1;
                        last INNER_LOOP;
                    }
                }

                # Next block please.
                $blocknum++;
            }

            # If we found the end of ZIP, we can now process the ZIP file.
            if ($foundend) {
                CheckZIPForConsistency (\@zipblocknum, \@zipdata);
            }
        }

        # Next block please.
        $blocknum++;
    }
}
my ($localHeader, $version, $fstype, $gpFlags, $compression_method, $lastModTime, $lastModDate, $crc, $compressed_size, $size, $nameLength, $extraLength) = unpack ("VCCvvvvVVVvv", $zipdatabuffer); # The local header should always be these (PK 03 04 / PK 01 02). if (($localHeader ne 0x4034B50) && ($localHeader ne 0x02014b50)) { # Should never happen, but just in case. warn ("$localHeader Error reading the local header."); return; } # Move on in the data past the local header. $zipdatabuffer = substr ($zipdatabuffer, 30); $curpos += 30; # Move on past the name and extra data. $zipdatabuffer = substr ($zipdatabuffer, $nameLength + $extraLength); $curpos += ($nameLength + $extraLength); # Set the compression start position to the current. $compstartpos = $curpos; # Holder for the compressed data. my $compresseddata; # If the General Purpose bit flags indicate a data descriptor, we # are going to manually find the end so that we can compare the # amount of compressed data with what the zip file meta information # says so that possibly we can recover it. if ($gpFlags & 8) { # Look for the section that comes after the compressed data. if ($zipdatabuffer =~ /^(.*?)PK\x07\x08/os) { # Set the compressed data. $compresseddata = $1; } else { # We cannot recover from this. last MAIN_LOOP; } } else { # Look for the section that comes after the compressed data. if ($zipdatabuffer =~ /^(.*?)PK\x03\x04/os) { # Set the compressed data. $compresseddata = $1; } else { # We cannot recover from this. last MAIN_LOOP; } } # Move on past the compressed data. $zipdatabuffer = substr ($zipdatabuffer, length ($compresseddata)); $curpos += (length ($compresseddata)); # Set the compression end position to the current. $compendpos = $curpos; # If the General Purpose bit flags indicate a data descriptor. if ($gpFlags & 8) { # Get the data descriptor data. 
#-----------------------------------------------------------------------------
# Try to find a run of consecutive blocks inside one ZIP entry's compressed
# data that, when left out, makes the remaining data inflate to something
# whose CRC-32 equals the CRC recorded for the entry.  If such a run is found
# the corresponding image blocks are tagged BLOCK_POSSIBLE_NOT_ZIP.
#
# For this version we only remove consecutive blocks.  It is possible that
# fragmented blocks would not be consecutive.
#
# Parameters:
#   $_[0] - ref to array of image block numbers making up the ZIP candidate.
#   $_[1] - ref to array of the data of those blocks (same order).
#   $_[2] - byte offset, within the joined data, where the compressed data
#           of the entry starts.
#   $_[3] - byte offset where it ends.
#   $_[4] - number of consecutive blocks to try removing.
#   $_[5] - CRC-32 the inflated data must have.
#
# Side effects: may rewrite entries of @g_blocks.
#-----------------------------------------------------------------------------
sub MarkNonZIPBlocks {
    # The arrays are used through their references; nothing here modifies
    # them, so copying all of the block data is unnecessary.
    my ($zipblocknumref, $zipdataref, $compstartpos, $compendpos,
        $numblocksremove, $targetcrc) = @_;

    # Various variables to make future computations easier.
    my $compstartblock  = int ($compstartpos / $gopt_blocksize);
    my $compendblock    = int ($compendpos / $gopt_blocksize) + 1;
    my $compstartposrem = $compstartpos % $gopt_blocksize;
    my $compendposrem   = $compendpos % $gopt_blocksize;

    # Partial data from the first block, and from the block at index
    # $compendblock.  The latter may lie past the collected data, in which
    # case there is nothing to append.
    my $compstart = substr ($zipdataref->[$compstartblock], $compstartposrem);
    my $compend   = defined $zipdataref->[$compendblock]
                  ? substr ($zipdataref->[$compendblock], 0, $compendposrem)
                  : "";

    # We start off with not having found a block to remove.
    my $startremovepos = -1;

    # $i is the first block of the window of blocks being left out.
    MAIN_LOOP: for (my $i = ($compstartblock + 1);
                    $i < ($compendblock - $numblocksremove);
                    $i++) {
        # Join every whole block that is outside the removal window.
        # Starts as "" so that an empty selection concatenates cleanly.
        my $buffer = "";
        for (my $j = ($compstartblock + 1); $j < $compendblock; $j++) {
            next if (($j >= $i) && ($j < ($i + $numblocksremove)));
            $buffer .= $zipdataref->[$j];
        }

        # The buffer is the first part, the middle, and the end.
        my $buffertocrc = $compstart.$buffer.$compend;

        # Initialize the inflater for a raw deflate stream.
        my ($x, $status) = inflateInit ('-WindowBits' => -MAX_WBITS());

        # The zlib status codes are compared numerically, exactly as the
        # inflate status is checked below.
        if ($status != Z_OK) {
            # Let the user know as this could be a config error.
            warn ("Could not initialize inflate.");
            return;
        }

        # Inflate as we need to CRC the inflated buffer.
        my ($output1, $status1) = $x->inflate (\$buffertocrc);

        # Failure is ignored: dropping arbitrary blocks will very often
        # produce an illegal deflate stream.
        if ($status1 == Z_OK || $status1 == Z_STREAM_END) {
            # Calculate the CRC of the data.
            my $crc = Compress::Zlib::crc32 ($output1, 0);

            # If the blocks we chose match by CRC then we found a good
            # combination, so we can stop.
            if ($crc == $targetcrc) {
                $startremovepos = $i;
                last MAIN_LOOP;
            }
        }
    }

    # Nothing to mark unless a working combination was found.
    return if ($startremovepos == -1);

    # Go through the consecutive blocks to remove and mark each one as NOT
    # possible ZIP data, translating the index to an image block number.
    for (my $k = $startremovepos;
         $k < ($startremovepos + $numblocksremove);
         $k++) {
        my $tomark = $zipblocknumref->[$k];
        $g_blocks[$tomark] = BLOCK_POSSIBLE_NOT_ZIP;
    }
}
#-----------------------------------------------------------------------------
# Drive the signature-based extraction.  Walk every entry of @g_blocks and,
# whenever the entry's value is a key of %g_sig2fn, call the code reference
# stored there with the input file handle and the block number.
#
# Parameters:
#   $_[0] - handle on the input file; it is rewound before the walk starts.
#
# The tag of each block is looked up at the moment the walk reaches it, so
# tags rewritten by an extract function called earlier are what gets seen.
#-----------------------------------------------------------------------------
sub ExtractFiles {
    my ($input) = @_;

    # Start from the beginning of the input file.
    $input->seek (0, 0);

    my $index = 0;
    while ($index <= $#g_blocks) {
        # Dispatch on the block's current tag, if a handler is registered.
        my $extractor = $g_sig2fn{$g_blocks[$index]};
        $extractor->($input, $index) if (defined $extractor);

        $index++;
    }
}
$reffn->($fh, $blocknum); next MAIN; } # Do some extra checking if this is is a JPEG end of image and # it was found as the first 2 bytes of a block. if ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_JPEG_EOI_SOB) { # If the next block has a signature. if (defined $g_sig2fn{$g_blocks[$blocknum+1]}) { # Defer to the signature by setting this one to EOI. $g_blocks[$blocknum] = BLOCK_POSSIBLE_JPEG_EOI; } else { # For all of the entries on the stack EXCEPT the last one (<). INNER: for (my $stacknum = 0; $stacknum < $#g_extractstack; $stacknum++) { # If we have a nested JPEG extraction going on. if ($g_extractstack[$stacknum] eq SIG_JPEG) { # Defer to the signature by setting to EOI. $g_blocks[$blocknum] = BLOCK_POSSIBLE_JPEG_EOI; # Don't need to keep checking although this would # likely be a small array. last INNER; } } } # Make sure the next block is usable for us before we use it. if (($g_blocks[$blocknum+1] eq BLOCK_POSSIBLE_JPEG_DATA) || ($g_blocks[$blocknum+1] eq BLOCK_BINARY) || ($g_blocks[$blocknum+1] eq BLOCK_POSSIBLE_JPEG_IMAGE) || ($g_blocks[$blocknum+1] eq BLOCK_POSSIBLE_JPEG_EOI)) { # Next please. We don't save this block out by # going back through the main loop. next MAIN; } else { # Defer to the signature by setting to EOI. $g_blocks[$blocknum] = BLOCK_POSSIBLE_JPEG_EOI; } } # If the current block is possibly JPEG related data. if (($g_blocks[$blocknum] eq BLOCK_POSSIBLE_JPEG_DATA) || ($g_blocks[$blocknum] eq BLOCK_BINARY) || ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_JPEG_IMAGE) || ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_NOT_ZIP) || ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_JPEG_EOI)) { # Skip through the file to the point we want. $fh->seek ($blocknum * $gopt_blocksize, 0); # Read the block, putting us at the beginning of the next block. $bufferread = $fh->sysread ($buffer, $gopt_blocksize); # If this is the end of the image. if ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_JPEG_EOI) { # Get everything from the start of the block to the end # of image marker. 
#-----------------------------------------------------------------------------
# Extract one ZIP from the input file, starting at a block that carries the
# ZIP signature.  Following blocks are appended to the output file unless
# they are tagged as text, HTML or not-ZIP, until a BLOCK_POSSIBLE_ZIP_END
# block has been written.  A block carrying another registered signature is
# handed to that signature's extract function instead.
#
# We are somewhat naive here in that we assume that the contents of a zip
# file cannot be text.
#
# Parameters:
#   $_[0] - handle on the input file (absolute seeks are used).
#   $_[1] - number of the signature block.
#
# Side effects: creates an output file via CreateTempFile, records the block
# numbers in %g_extractdetails, tags written blocks BLOCK_PROCESSED and
# pushes/pops SIG_ZIP on @g_extractstack.
#-----------------------------------------------------------------------------
sub ExtractZIP {
    # Get the parameters that were passed in.
    my $fh = $_[0];
    my $startblocknum = $_[1];

    # Add to the stack that we are processing a ZIP.
    push (@g_extractstack, SIG_ZIP);

    # Create a temporary file output handle and make binary.
    my ($name, $tempfh) = CreateTempFile ("zip");
    binmode ($tempfh);

    # Holder for the block data.
    my $buffer = "";

    # Read the signature block itself and write it out.
    $fh->seek ($startblocknum * $gopt_blocksize, 0);
    $fh->sysread ($buffer, $gopt_blocksize);
    print $tempfh $buffer;
    $g_extractdetails{$name} .= "$startblocknum";

    # Mark the current block as processed.
    $g_blocks[$startblocknum] = BLOCK_PROCESSED;

    # Go through all of the following blocks.
    MAIN: for (my $blocknum = $startblocknum + 1;
               $blocknum <= $#g_blocks;
               $blocknum++) {
        # Skip the block if it has already been processed.
        next MAIN if ($g_blocks[$blocknum] eq BLOCK_PROCESSED);

        # A block with a recognized signature starts another file; defer
        # to that signature's extract function.
        my $reffn = $g_sig2fn{$g_blocks[$blocknum]};
        if (defined $reffn) {
            $reffn->($fh, $blocknum);
            next MAIN;
        }

        # Blocks with these tags are not considered ZIP data.  This could
        # happen all the time if fragmentation occurred.
        next MAIN if (($g_blocks[$blocknum] eq BLOCK_CONTINUE_TEXT) ||
                      ($g_blocks[$blocknum] eq BLOCK_TEXT) ||
                      ($g_blocks[$blocknum] eq BLOCK_TEXT_END) ||
                      ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_HTML_START) ||
                      ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_HTML) ||
                      ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_HTML_END) ||
                      ($g_blocks[$blocknum] eq BLOCK_POSSIBLE_NOT_ZIP) ||
                      ($g_blocks[$blocknum] eq BLOCK_UNICODE_CONTINUE_TEXT) ||
                      ($g_blocks[$blocknum] eq BLOCK_UNICODE_TEXT));

        # Read the block.
        $fh->seek ($blocknum * $gopt_blocksize, 0);
        $fh->sysread ($buffer, $gopt_blocksize);

        # Remember what the block was before we change its tag.
        my $blocktype = $g_blocks[$blocknum];

        # At the end of the zip, keep only the data up to and including
        # the end record so that slack space is not written out.
        #
        # Note, we are assuming no ZIP comment at the end of the ZIP.
        if (($blocktype eq BLOCK_POSSIBLE_ZIP_END) &&
            ($buffer =~ /^(.*PK\x05\x06.{18})/os)) {
            $buffer = $1;
        }

        # Output the current block to the output file and log it.
        print $tempfh $buffer;
        $g_extractdetails{$name} .= "\n$blocknum";

        # Mark the current block as processed.
        $g_blocks[$blocknum] = BLOCK_PROCESSED;

        # We don't need to continue if we're at the end of the zip.
        last MAIN if ($blocktype eq BLOCK_POSSIBLE_ZIP_END);
    }

    # Close the output file.  Buffered write errors only show up here, so
    # report them rather than silently leaving a short file behind.
    close ($tempfh) or warn ("Error: Could not close $name: $!");

    # Pop the last item off the stack.
    my $lastitem = pop (@g_extractstack);

    # If the last item was not ours then we have an issue.
    if ($lastitem ne SIG_ZIP) {
        # It's not the end of the world.  Just let people know.
        warn ("Error: Removed incorrect entry on the stack");
    }
}
my $buffer = ""; my $bufferread = 0; # Read in the first block after the signature. $bufferread = $fh->sysread ($buffer, $gopt_blocksize); # Look for a tell-tale sign of a block that is second after the sig. # We do this to prevent a wild goose chase. There might be a better # way to do this if we can really understand the DOC file format. if ($buffer !~ /\x00{10}/os) { # We are not going to pursue this one. return; } # Add to the stack that we are processing a DOC. push (@g_extractstack, SIG_MSODOC); # Create a temporary file output handle and make binary. my ($name, $tempfh) = CreateTempFile ("doc"); binmode ($tempfh); # Go to the block we need from the start of the input file. $fh->seek ($startblocknum * $gopt_blocksize, 0); # Read in the first block which is the signature for our file type. $bufferread = $fh->sysread ($buffer, $gopt_blocksize); # Output the current block to the output file and log that we are # outputting this block. print $tempfh $buffer; $g_extractdetails{$name} .= "$startblocknum"; # Mark the current block as processed. $g_blocks[$startblocknum] = BLOCK_PROCESSED; # Go through all of the blocks. MAIN: for (my $blocknum = $startblocknum + 1; $blocknum <= $#g_blocks; $blocknum++) { # Skip the block if it has already been processed. next if ($g_blocks[$blocknum] eq BLOCK_PROCESSED); # Look to see if the current block has a signature we recognize. my $reffn = $g_sig2fn{$g_blocks[$blocknum]}; # If the current block has a recognized signature then we defer our # processing to this other function. if (defined $reffn) { # Call the signatures function. $reffn->($fh, $blocknum); next; } # If the current block is possibly DOC related data. We don't have # a way currently to determine it. Docs can contain binary and # text data of most kinds. if (1) { # Skip through the file to the point we want. $fh->seek ($blocknum * $gopt_blocksize, 0); # Read the block, putting us at the beginning of the next block. 
#-----------------------------------------------------------------------------
# Extract the HTML files from the image file.  Every block tagged
# BLOCK_POSSIBLE_HTML_START begins a new output file; following blocks tagged
# as text or HTML are appended until a BLOCK_POSSIBLE_HTML_END block has been
# written.
#
# Parameters:
#   $_[0] - handle on the input file (absolute seeks are used).
#
# Side effects: creates output files via CreateTempFile, records the block
# numbers in %g_extractdetails and tags written blocks BLOCK_PROCESSED.
#-----------------------------------------------------------------------------
sub ExtractHTMLFiles {
    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Holder for the block data.
    my $buffer = "";

    # Go through all of the blocks in the input file.
    MAIN: for (my $blocknum = 0; $blocknum <= $#g_blocks; $blocknum++) {
        # Skip the block if it is not a start of HTML block.
        next MAIN if ($g_blocks[$blocknum] ne BLOCK_POSSIBLE_HTML_START);

        # Create a temporary file output handle and make binary.
        my ($name, $tempfh) = CreateTempFile ("HTML");
        binmode ($tempfh);

        # Read the first block and write it out.
        # TODO : do from current position.
        $fh->seek ($blocknum * $gopt_blocksize, 0);
        $fh->sysread ($buffer, $gopt_blocksize);
        print $tempfh $buffer;
        $g_extractdetails{$name} .= "$blocknum";

        # Mark the current block as processed.
        $g_blocks[$blocknum] = BLOCK_PROCESSED;

        # Go through the following blocks looking for the rest of the HTML.
        INNER: for (my $innerblocknum = $blocknum + 1;
                    $innerblocknum <= $#g_blocks;
                    $innerblocknum++) {
            # Save the block type; it is needed after the tag is changed.
            my $blocktype = $g_blocks[$innerblocknum];

            # Check that it is a block type we care about.
            next INNER if (($blocktype ne BLOCK_CONTINUE_TEXT) &&
                           ($blocktype ne BLOCK_TEXT) &&
                           ($blocktype ne BLOCK_TEXT_END) &&
                           ($blocktype ne BLOCK_POSSIBLE_HTML) &&
                           ($blocktype ne BLOCK_POSSIBLE_HTML_END));

            # Read in the block.
            # TODO - do this from current position.
            $fh->seek ($innerblocknum * $gopt_blocksize, 0);
            $fh->sysread ($buffer, $gopt_blocksize);

            # At the end of the HTML keep everything up to the closing
            # html tag and the tag-like text that directly follows it.
            if (($blocktype eq BLOCK_POSSIBLE_HTML_END) &&
                ($buffer =~ /^(.*\<\/html[a-z0-9\<\>_]*)/ios)) {
                $buffer = $1;
            }

            # Output the current block to the output file and log it.
            print $tempfh $buffer;
            $g_extractdetails{$name} .= "\n$innerblocknum";

            # Mark the current block as processed.
            $g_blocks[$innerblocknum] = BLOCK_PROCESSED;

            # We don't need to continue if we're at the end of HTML.
            last INNER if ($blocktype eq BLOCK_POSSIBLE_HTML_END);
        }

        # Close this output file, as the other extract functions do.
        # Buffered write errors only show up here.
        close ($tempfh) or warn ("Error: Could not close $name: $!");
    }
}
#-----------------------------------------------------------------------------
sub ExtractTextFiles {
    # Parameters:
    #   $_[0] - handle object for the input image; must support the seek()
    #           and sysread() methods.
    #
    # For every block marked BLOCK_TEXT a new temporary file with the "TXT"
    # extension is created and the block is copied into it.  Later text
    # blocks are appended, in block order, until a BLOCK_TEXT_END block has
    # been written.  Every block written is recorded in %g_extractdetails
    # and re-marked as BLOCK_PROCESSED in @g_blocks.

    # Get the file handle for the input file and go to the beginning.
    my $fh = $_[0];
    $fh->seek (0, 0);

    # Holder for the data of the block currently being copied.
    my $buffer = "";

    # Block types that may continue a text file once its start was found.
    my %is_continuation = map { ($_ => 1) } (
        BLOCK_CONTINUE_TEXT,
        BLOCK_TEXT,
        BLOCK_TEXT_END,
    );

    # Go through all of the blocks in the input file.
    MAIN:
    for my $blocknum (0 .. $#g_blocks) {
        # Skip the block if it is not a start of text block.
        next MAIN if ($g_blocks[$blocknum] ne BLOCK_TEXT);

        # Create a temporary file output handle and make binary.
        my ($name, $tempfh) = CreateTempFile ("TXT");
        binmode ($tempfh);

        # Go to the block we need from the start of the input file.
        # TODO : do from current position.
        $fh->seek ($blocknum * $gopt_blocksize, 0);

        # Read in the first block.
        $fh->sysread ($buffer, $gopt_blocksize);

        # Output the current block to the output file and log that we are
        # outputting this block.
        print {$tempfh} $buffer;
        $g_extractdetails{$name} .= "$blocknum";

        # Mark the current block as processed.
        $g_blocks[$blocknum] = BLOCK_PROCESSED;

        # Go through all of the later blocks looking for the rest of the text.
        INNER:
        for my $innerblocknum ($blocknum + 1 .. $#g_blocks) {
            # Remember the type now; the block is re-marked further down.
            my $blocktype = $g_blocks[$innerblocknum];

            # Check that it is a block type we care about.
            next INNER if (!$is_continuation{$blocktype});

            # Go to the block we need from the start of the input file.
            # TODO - do this from current position.
            $fh->seek ($innerblocknum * $gopt_blocksize, 0);

            # Read in the block.
            $fh->sysread ($buffer, $gopt_blocksize);

            # If this is the end of the text, keep only the leading run of
            # bytes in the range 0x00-0x7E.  The hex escapes are required:
            # written as [0x00-0x7E] the class only matches the literal
            # characters '0', 'x', '7' and 'E'.
            if ($blocktype eq BLOCK_TEXT_END) {
                if ($buffer =~ /\A([\x00-\x7E]*)/) {
                    # Set the buffer to just the part of the text file.
                    $buffer = $1;
                }
            }

            # Output the current block to the output file and log that we
            # are outputting this block.
            print {$tempfh} $buffer;
            $g_extractdetails{$name} .= "\n$innerblocknum";

            # Mark the current block as processed.
            $g_blocks[$innerblocknum] = BLOCK_PROCESSED;

            # We don't need to continue if we're at the end of text.
            last INNER if ($blocktype eq BLOCK_TEXT_END);
        }

        # Close the output file explicitly rather than relying on the handle
        # going out of scope.
        close ($tempfh);
    }
}

#-----------------------------------------------------------------------------
# Create a temporary file using the passed in extension.
#-----------------------------------------------------------------------------
BEGIN {
    # "Static" variable so that we always increment filename
    # for this invocation.
    my $number = 1;

    #-------------------------------------------------------------------------
    # Create a temporary file.
    #
    # Parameters:
    #   $_[0] - extension (without the dot) for the new file.
    #
    # Returns the list (name, handle) where name is the full path of the new
    # file inside $gopt_outdirectory (relative to the current working
    # directory) and handle is an IO::File opened read/write on it.
    #
    # Dies if the file cannot be created for any reason other than the name
    # already being taken.
    #-------------------------------------------------------------------------
    sub CreateTempFile {
        # Get the passed in extension.
        my $ext = $_[0];

        # Holder for the handle and name.
        my $tempfh;
        my $name = "";

        # Keep trying successive numbers until a file is created.
        until (defined $tempfh) {
            # Pad with zeros.
            my $paddednumber = sprintf ("%05d", $number);

            # Create the name and also increment count.
            $name = cwd()."/$gopt_outdirectory/$paddednumber.$ext";
            $number++;

            # O_EXCL makes the create fail if the name is already in use.
            $tempfh = IO::File->new ($name, O_RDWR | O_CREAT | O_EXCL);
            next if (defined $tempfh);

            # Only "already exists" is worth retrying with the next number.
            # Anything else (missing directory, permissions, ...) would fail
            # for every name and loop forever.
            die "Cannot create temporary file $name: $!\n"
                unless $!{EEXIST};
        }

        # Return what we created.
        return ($name, $tempfh);
    }
}

#-----------------------------------------------------------------------------
# Do things that need to happen at the start of the script.
#
# Records the start time in $g_starttime / $g_starttimetext and, when PDF
# output was requested, builds $g_pdffilename and initializes the PDF helper.
#-----------------------------------------------------------------------------
sub StartUp {
    # Get the time that we started doing the actual processing.
    $g_starttime     = time();
    $g_starttimetext = DCULHelper::FormatDateTime ($g_starttime);

    # If the user wants PDF output.
    if ($gopt_outputpdf) {
        # Start with the script filename.
        $g_pdffilename = $g_thisscriptbasename;

        # Replace everything from the first dot onwards with a space.
        $g_pdffilename =~ s/\..*$/ /;

        # Add the time and date.
        $g_pdffilename .= $g_starttimetext;

        # Turn spaces, tabs and :'s into underscores.
        $g_pdffilename =~ tr/ \t\:/_/;

        # Add the extension.
        $g_pdffilename .= ".pdf";

        # Initialize the PDF helper.
        PDFHelper::StartUp ($g_pdffilename,
                            $gopt_author,
                            "$g_thisscriptbasename Report",
                            $gopt_subtitle,
                            $gopt_reporttitle);
    }
}

#-----------------------------------------------------------------------------
# Do things that need to happen at the end of the script.
#
# Prints the summary page (elapsed time, bytes read, number of extracted
# files) and, when PDF output was requested, appends the script source and
# shuts the PDF helper down.
#-----------------------------------------------------------------------------
sub ShutDown {
    # Output a new page title for the summary.
    DCULHelper::PrintPageTitle ("Summary");

    # Output the time taken since StartUp recorded $g_starttime.
    my $totaltime = time() - $g_starttime;
    DCULHelper::PrintLn ("Total time taken: $totaltime second(s)");

    # Output the total bytes read from the input file.
    DCULHelper::PrintLn ("Input file total bytes read: $g_totalbytesread");

    # Get the number of files extracted and output.  There is one entry in
    # %g_extractdetails per output file name.
    my $numfilesextracted = keys (%g_extractdetails);
    DCULHelper::PrintLn ("Number of files extracted: $numfilesextracted");

    # If the user wanted PDF output.
    if ($gopt_outputpdf) {
        # Output the source for the script.  Don't need to do this for the
        # console output as the user could output it whenever they wanted.
        DCULHelper::OutputScriptSource (\@g_scriptlines);

        # Cleanup the PDF helper.
        PDFHelper::ShutDown();
    }
}

#-----------------------------------------------------------------------------
# Read in the dictionary.
#
# Loads every line of "wordlist.txt" (looked up relative to the current
# working directory) into %g_dict, lower-cased, with a value of 1.
#-----------------------------------------------------------------------------
sub ReadDictionary {
    # Open the dictionary file.  We choose to die otherwise the user
    # might not see the warning message and assume that the script did
    # the best it could.
    open (my $dictfh, '<', "wordlist.txt")
        or die "Cannot open dictionary: ", $!;

    # Add each of the words read into the dictionary.
    while (my $line = <$dictfh>) {
        # Remove the line ending, including a CR left over from a
        # DOS-format word list.
        $line =~ s/[\r\n]+\z//;

        # Add to the dictionary hash.
        $g_dict{lc ($line)} = 1;
    }

    # We can close the file now.
    close ($dictfh);
}

#-----------------------------------------------------------------------------
# Print out useful information about this script.
#
# Also loads the script's own source into @g_scriptlines so that it can be
# output later.
#-----------------------------------------------------------------------------
sub PrintScriptInformation {
    # Script information page.
    DCULHelper::PrintPageTitle ("Script Information");

    # Output the script version number.
    DCULHelper::PrintLn ("$g_thisscriptbasename version number: ".
                         VERSION_NUMBER);

    # Read in this script so can create MD5 hash and output the
    # source if we wanted.
    @g_scriptlines = ();
    if (open (my $scriptfh, '<', $g_thisscript)) {
        binmode ($scriptfh);
        @g_scriptlines = <$scriptfh>;
        close ($scriptfh);

        # Put the script into a handy string
        my $scriptsource = join ("", @g_scriptlines);

        # Calculate the script's MD5 hash and output.
        DCULHelper::PrintLn ("$g_thisscriptbasename MD5: ".
                             md5_hex ($scriptsource));
    }
    else {
        # Not fatal, but do not report the hash of nothing as if it were
        # the hash of the script.
        warn "Cannot open file $g_thisscript: $!";
    }

    # Output the last modified, created and size information for the script.
    DCULHelper::PrintLastModCreatedSizeForFile ($g_thisscript);
}

#-----------------------------------------------------------------------------
# Print out useful information about this invocation.
#-----------------------------------------------------------------------------
sub PrintInvocationInformation {
    # Invocation information page.
    DCULHelper::PrintPageTitle ("Invocation");

    # Output the invocation information.
    DCULHelper::PrintLn ("Started: $g_starttimetext");
    DCULHelper::PrintLn ("Command line: $g_cmdline");
    DCULHelper::PrintLn ("Current working directory: ".cwd());

    # Output the file that PDF output is being output to should
    # it be actually being output.
    DCULHelper::PrintLn ("PDF Output: $g_pdffilename") if $gopt_outputpdf;

    # Output the input file's MD5 hash.
    DCULHelper::PrintMD5HashofInputFile ($gopt_file);

    # Output the last modified, created and size information
    # for the input file.
    DCULHelper::PrintLastModCreatedSizeForFile ($gopt_file);
}

#-----------------------------------------------------------------------------
# Print out useful information about the environment.  This information is
# readily available from the command line so is only an option for outputting
# to the PDF file.
#-----------------------------------------------------------------------------
sub PrintEnvironmentInformation {
    # If the user requested PDF output.
    if ($gopt_outputpdf) {
        # Output the environment information.
        DCULHelper::PrintEnvironmentInformation();
    }
}

#-----------------------------------------------------------------------------
# Output the usage of this script.
#-----------------------------------------------------------------------------
sub PrintUsage {
    # Get the text for the required invocation.
    my $requiredinvocation =
        DCULHelper::GetRequiredOptionUsage (\%g_options);

    # Get the text for the options.
    my $commandlineoptions =
        DCULHelper::GetOptionUsage (\%g_options, \%g_optionhelp);

    print <<"ENDOFUSAGE";
Usage: $g_thisscriptbasename $requiredinvocation [OPTIONS]

This script ferrets out files from a given disk image file. It uses a number
of techniques to ensure the best possible block sequence based on a
fragmented image. For more technical information please feel free to read
the source code which has lots of comments.

Command-line options:
$commandlineoptions
Examples:

  This is the basic invocation of this script:

    $g_thisscriptbasename --file=image.dat

  Same usage of the script, this time requesting outputting the block map.

    $g_thisscriptbasename --file=image.dat --outputblocks

  Specifying a blocks size of 1024 bytes instead of the default.

    $g_thisscriptbasename --file=image.dat --bs=1024

  Specifying a different output directory than the current directory..

    $g_thisscriptbasename --file=image.dat --outputdir=\\evidence\\out

ENDOFUSAGE
}

#-----------------------------------------------------------------------------
# Output the version information for this script.
#-----------------------------------------------------------------------------
sub PrintVersion {
    print <<"ENDOFVERSION";
$g_thisscriptbasename @{[ VERSION_NUMBER ]}
Written by John Goalby.

Part of the Data Carving Utility Library (DCUL).
http://www.sftsrc.com/DCUL/

Copyright (C) 2006 SoftSource Consulting.
ENDOFVERSION
}

#-----------------------------------------------------------------------------
# The end of the road.
#-----------------------------------------------------------------------------