######################################################################## # recover-rfc-2822.pl 0.1 (pre-alpha), (c) Copyright 2007 by # Golden G. Richard III. # # Attempt to recover RFC 2822+MIME-compliant email messages from a # disk image, using a subset of RFC 2822 and MIME email rules to # detect the beginning and end of email messages. # Messages are created in the directory "email-output" by default. # This can be overridden with the "-o" command line switch. Filenames # for recovered email are of the form # "email-startbyte-endbyte.txt", where startbyte and endbyte # are the starting and ending bytes where the email message # was detected. # # RFC 2822 requires that email messages be no more than 1000 # characters per line (incl/ CR+LF), but recommends a maximum line # length of 80 characters (incl/ CR+LF). The default limit is 1000 # characters per line. This can be overridden with the "-n" command # line option. # # The image file to process is specified using the "-f" option. This # option is required. ######################################################################## ############ ### ** LIMITATIONS (TO DO) ** ### o Supporting "binary" content encoding type is going to ### require a non-line oriented approach?? ### ### o There's an implicit assumption that Content-Type: and ### Content-Transfer-Encoding: will appear in pairs and that the ### stacks that track these values in multi-part messages won't ### de-synchronize. ### ### o Need "open to header processing" flags so that bodies that ### contain text that matches a header definition won't be processed ### incorrectly. Need to be "open to header processing" while a blank ### line (which terminates a block of header lines) isn't seen. ### ### o all_chars_in_charset should cache parsed charset definitions rather than ### recomputing the ranges each time it's called. ### ### o Need a switch to allow 8 bit characters in header fields, absent ### a watchdog inserting warning headers. Mark with special filenames ### for easy examination. ### ### o Need configuration files for non-printable alerts and allowed ### non-standard header lines ### ### o Consider combining non-standard header lines and important headers like ### Return-Path: into one list; would reduce code complexity ### ### o Allow forcing processing for new email messages to begin on ### cluster boundaries (default 512 bytes) ### ### o Specify threshold for how many header lines must be seen ### ### o Rudimentary fragmentation support buffers the entire image file ### during the second pass, mostly to allow file processing issues ### to be ignored while the technique is worked out. This will be unacceptable for large ### image files and needs to be fixed if we don't scrap the fragmentation support. ### ############ # command line options processing module use Getopt::Long; use strict; use warnings; use Digest::MD5; # global scalars my $DEBUG=0; my $VERSION="recover-rfc2822 v0.1prealpha"; my $outputdirectory = "email-output"; my $auditfilename = "audit.txt"; my $altauditfilename = "md5-audit.txt"; my $email; my $emailcounter=0; my $imagefilename=""; my $imagefilesize=0; my $percentprocessed=0; my $previousprocessed=-1; my $pushback=""; my $line; my $sawreturnpath=0; my $numheaderlines=0; my $oldfilepos=-5; my $filepos=0; my $startbyte; my $endbyte; my $eightbitok=0; my @boundary_string_stack; my @email_coverage; my %orphan_boundaries; my %orphan_boundary_locations; my $charset; my @charset_stack; my $encoding; my @encoding_stack; my %header_charset_strings; my %body_charset_strings; # globals for tuning behavior my $maxRFC2822linelen = 1000; my $minheaderlines=5; # minimum number of header lines that must be # seen for a block to be considered an email my $clusteraligned=0; # if > 0, the beginnings of email messages must # be cluster-aligned. The value of this variable # is the cluster size. my $returnpathisfooter=1; # if > 0, then "Return-Path:" email headers always # act as a footer for the current # email msg being processed ################################################## ################################################## ### THIS LIST IS JUST A FIRST CUT--IT NEEDS WORK! ################################################## ################################################## # The following list encodes basic information about which byte values # are likely to be encountered when processing an email with a # specific charset. For email fragments with "quotable-printable" or # "base64" encoding, these charset definitions are not used. # Entries here can substantially improve email boundary detection # (because byte values outside the specified definitions trigger an # end-of-email condition.) # # Any emails with declared character sets that are not included in the # list are assumed to (possibly) contain valid 8-bit characters in the # email body. # # Many definitions could be improved, e.g., for SHIFT_JIS, where only # certain values are used for the second byte of each character. # # Matching against email message charsets is not case sensitive, but # all entries in this table should be LOWERCASE. my %charset_definitions=( # Character sets for which we don't usually expect non-printable # ASCII. The us-ascii (+ variants) can be set to 32-126, but this # will cause some emails to be truncated if special upper-range ASCII # characters are used. "iso-8859-1", "32-126+128-255", "3diso-8859-1", "32-126+128-255", "us-ascii", "32-126+128-255", "3dus-ascii", "32-126+128-255", "windows-1252", "32-126+128-255", # Character sets for which we always assume 8-bit printable values # (ASCII >= 32 + CR+LF). "iso-8859-2", "32-255", "gb2312", "32-255", "shift_jis", "32-255", "koi8-r", "32-255", "windows-1250", "32-255", # <= 127 is plain ASCII # This is a 7-bit character set which uses ASCII 32-126 + the ESCcape # character for (internal) charset switching, but the character sets # that are switched to can have complicated ranges. 32-255 seems safe. "iso-2022-jp", "27-27+32-255", # Other multi-byte character sets which use restrictive ranges for at # least some bytes. This group in particular could benefit from a # better charset definition scheme. "big5", "32-254", "big-5", "32-254", "utf-8", "0-255", ## lame--much more info if bytes eval'd separately # Single byte character sets with restrictive ranges "windows-1251", "32-126+128-151+153-255", "win-1251", "32-126+128-151+153-255", "iso-8859-6", "32-126+160-160+164-164+172-173+187-187+191-191+193-218+224-242", # arabic "iso-8859-7", "32-126+160-173+175-209+211-254", # greek "windows-1255", "32-126+128-128+130-137+139-139+145-153+155-155+160-201+203-216+224-250+253-254", # hebrew # All 8-bit values OK (for consistency) "8bit", "0-255" ); # Alerts that signal non-printable characters in email headers for # this message. If this alert is seen, then tolerate 8-bit data in # email headers. These must be valid regular expressions. my @nonprintablealerts=('^X-Amavis-Alert:.*BAD HEADER.*Non-encoded 8-bit data'); # Definitions for lines that are allowed in headers, but which don't # meet the RFC requirements. Examples include "From " lines # introduced when emails are stored. Determining these may require # some preprocessing of the disk image. These regular expressions # must be as specific as possible, otherwise email body recovery will # become desynchronized and email bodies may be truncated. # # strings | grep "From " | sort | uniq or similar can be used as # a quick pre-processing step to determine these strings. # my @allowednonstandardheaders=('From[ ]owner[\-]postfix[\-]announce.cloud9.net', 'From[ ]owner[\-]tct[\-]users.porcupine.org'); ###################################################################### ###################################################################### ###################################################################### ###################################################################### # return a copy of the input string with trailing whitespace removed # # Args: string # Returns: string with whitespace removed ###################################################################### sub chop_whitespace { my $s=shift; my $chopping=1; while (length($s) > 0 && $chopping) { $chopping=0; my $c=substr($s, length($s)-1, 1); if ($c eq " " || $c eq "\t" || $c eq "\r" || $c eq "\n") { chop $s; $chopping=1; } } return $s; } ###################################################################### # return a string from an input stream, with support for RFC 822-style # folding # # Args: type glob for file handle # Returns: string read from stream ###################################################################### sub getline_with_folding { local *FH=shift; my $s; my $s1; my $c; my $folding=1; my $eof=0; if (length($pushback)) { $s=$pushback; if ($pushback eq "\n") { $pushback=""; return $s; } } $pushback=""; # read one line $s1 = ; if ($s1) { $s = $s . $s1; } else { $eof=1; } while (! $eof && $folding) { # peek ahead to see if the next character is whitespace; if it is, # then read another line and append to the return value, otherwise # remember the character and return; the read-ahead character will # be used for the next line of input that's read $c = getc(FH); if (eof(FH)) { $eof=1; } else { if ($c && ($c eq " " || $c eq "\t")) { $s1 = ; if (! $s1) { $eof=1; } $s = $s . $c . $s1; } else { $folding=0; $pushback=$c; } } } # update global file position $filepos=tell(FH); $percentprocessed = int($filepos / $imagefilesize * 100); if ($percentprocessed > $previousprocessed) { $previousprocessed=$percentprocessed; print "Completed: $percentprocessed%\n"; } return $s; } ###################################################################### ###################################################################### # return values of boundary field and/or charset field for a Content-Type header. # undef is returned for values of fields which are not present. # # Args: input string (potentially multi-line) # Returns: ($boundary_field_value, $charset_value) ###################################################################### sub parse_contenttype { my $lin=shift; my $boundary; my $charset; my $sawcontenttype = ($lin =~ /^Content-Type:/s); # hide regular expression with match extraction unless it's needed if ($sawcontenttype) { # prefer boundary field with double-quoted value my $sawboundary = ($lin =~ /^(Content-Type:.*boundary=)["](\S+)["]/s); if ($sawboundary) { $boundary=$2; } else { # try to detect boundary field w/o double quotes $sawboundary = ($lin =~ /^(Content-Type:.*boundary=)(\S+)/s); if ($sawboundary) { $boundary=$2; } } # prefer charset field with double-quoted value my $sawcharset = ($lin =~ /^(Content-Type:.*charset=)["]([A-Za-z0-9-]+)["]/s); if ($sawcharset) { $charset=$2; } else { # try to detect charset field w/o double quotes $sawcharset = ($lin =~ /^(Content-Type:.*charset=)([A-Za-z0-9-]+)/s); if ($sawcharset) { $charset=$2; } } } return ($boundary, $charset); } ###################################################################### ###################################################################### # return value of RFC-2047-encoded charset from a Subject header. # undef is returned values of fields which are not present. # # Args: input string (potentially multi-line) # Returns: charset ###################################################################### sub parse_subjectforcharset { my $lin=shift; my $sawsubjectwithcharset = ($lin =~ /^Subject:[ \t]*\=\?([a-zA-Z0-9-]+)\?/s); my $charset=$1; return $charset; } ###################################################################### ###################################################################### # return value of encoding for a Content-Transfer-Encoding header. # undef is returned for values of fields which are not present. # # Args: input string (potentially multi-line) # Returns: encoding ###################################################################### sub parse_contenttransferencoding { my $lin=shift; my $e; my $sawtransferencoding = ($lin =~ /^Content-Transfer-Encoding:/s); # hide regular expression with match extraction unless it's needed if ($sawtransferencoding) { # prefer double-quoted value $sawtransferencoding = ($lin =~ /(^Content-Transfer-Encoding:[ \t]*)["]([A-Za-z0-9-]+)["]$/s); if ($sawtransferencoding) { $e=$2; } else { # try to detect field value w/o double quotes $sawtransferencoding = ($lin =~ /(^Content-Transfer-Encoding:[ \t]*)([A-Za-z0-9-]+)$/s); if ($sawtransferencoding) { $e=$2; } } } return $e; } ###################################################################### ###################################################################### # determine if the interval [$startbyte, $endbyte] overlaps any interval # stored in @email_coverage. # # ATTENTION, S'IL TE PLAIS: # # ! @email_coverage is assumed to be sorted by beginning byte values ! # # # Args: $startbyte, $endbyte # Returns: 1 if there's overlap, otherwise 0 ###################################################################### sub covered { my $begin=shift; my $end=shift; my $i; my $b=0; my $e; my $ref; my $len=scalar(@email_coverage); my $covered=0; for ($i=0; $i < $len && ! $covered && $b <= $end; $i++) { $ref=$email_coverage[$i]; $b=${$ref}[0]; $e=${$ref}[1]; $covered = ($begin >= $b && $begin <= $e || $end >= $b && $end <= $e); } return $covered; } ###################################################################### ###################################################################### # determine if a string is composed entirely of bytes whose values # fall within the range specified by the the parameter charset # (+ CR / LF and TAB). # # Args: charset, string to check # Returns: 1 if all characters within limits, otherwise 0 ###################################################################### sub all_chars_in_charset { my $charset=lc(shift); my $s=shift; my $charsetdef = $charset_definitions{$charset}; my $ret=1; my $i=0; my $len=length($s); my $c; my $n; my $begin; my $end; my @begins; my @ends; my @ranges; if (! defined($charsetdef) || $charsetdef eq "") { print "WARNING: using default 8bit character set in all_chars_in_charset()...\n"; $charsetdef = $charset_definitions{"8bit"}; } (@ranges) = split(/\+/, $charsetdef); foreach $i (@ranges) { ($begin, $end) = split(/\-/, $i); push(@begins, $begin); push(@ends, $end); if (! defined($begin) || ! defined($end)) { die "Bad range in definition of charset $charset\n"; } } while ($i < $len && $ret) { $c = substr($s, $i, 1); $ret = $c eq "\n" || $c eq "\r" || $c eq "\t" || $c eq " "; if (! $ret) { $n = ord($c); for (my $j=0; $j < scalar(@begins) && ! $ret; $j++) { $ret = $n >= $begins[$j] && $n <= $ends[$j]; } } $i++; } return $ret; } ###################################################################### ###################################################################### # determine if a string is composed entirely of printable ASCII # characters (+ CR / LF) # # Args: string to check # Returns: 1 if all printable characters, otherwise 0 ###################################################################### sub all_printable_chars { my $s=shift; return all_chars_in_charset("us-ascii", $s); } ###################################################################### ###################################################################### sub preheader_processing { ################################# # PRE-HEADER processing ################################# # searching for a viable email header line: try to find a section of # an input line beginning with [a-zA-Z0-9\-]+: and with this section # of the line exceeding $maxRFC2822linelen and which contains only # printable characters. Once a line like this is found, drop into # the header processing section. The first line of an email header # block isn't necessarily preceded by a CR/LF, so do multi-line # regular expression searches and then trim away data preceding the # header tag. ################################# my $scanning_pre_header=1; my $potentialheader; my $nonstand; my $k; my $part1; my $part2; my $part3; while (defined($line) && $scanning_pre_header) { if ($DEBUG) { print "$line"; } # need to be liberal when identifying the first header line of an # email; it may not be preceded by a CR/LF. If we do see # something that looks like a header line and that passes some # basic sanity checks, trim away the portion of $line preceding # the header and then drop out of pre-header processing. We # insist that this initial header line be exclusively printable ASCII # characters--otherwise there are too many false positives. $potentialheader=0; # try non-standard header definitions first, because it's possible # that these will be less restrictive than the standard format for # headers for ($k=0; $k < scalar(@allowednonstandardheaders) && ! $potentialheader; $k++) { $nonstand='^(.*?)(' . $allowednonstandardheaders[$k] . ')(.*$)'; $potentialheader = ($line =~ /$nonstand/s); $part1=$1; $part2=$2; $part3=$3; } if (! $potentialheader) { $potentialheader = ($line =~ /^(.*?)([A-Za-z0-9][A-Za-z0-9-]+:)(.*$)/s); $part1=$1; $part2=$2; $part3=$3; } if ($potentialheader) { # get only the potential header portion of the input line $line = $part2 . $part3; # remember start position of potential email $startbyte = $filepos - length($part3) - length ($part2) - 1; } $scanning_pre_header = length($line) > $maxRFC2822linelen || ! $potentialheader || $oldfilepos == $filepos; if ($scanning_pre_header) { $line = getline_with_folding(*F); } else { # remember old file position--if header/body processing sections don't # do anything, then they didn't like what we fed them $oldfilepos=$filepos; } } if ($DEBUG) { print "**** END OF PRE-HEADER PARSING ***\n"; } } ###################################################################### ###################################################################### sub header_processing { ################################# # HEADER processing ################################# # have a viable header. Continue processing lines until we don't # see a line beginning with [a-zA-Z0-9][A-Za-z0-9-]+: and along the # way, try to learn useful info about the format of the body of the # email message, including whether the message content is 7bit, # 8bit, or binary, whether the message is multi-part (and what the # outer boundary, defined in the Content-Type header, is). # Detection of more than one "Return-Path:" header terminates header # AND body processing just before the occurrence of the # "Return-Path:", unless this behavior has been overridden. This # almost always means that we've run into another email message in # the disk image. ################################# my $scanning_header=1; my $boundary; my $nonstand; my $b; my $k; my $potentialheader; my $cs; my $enc; while (defined($line) && $scanning_header) { if ($DEBUG) { print "\"$line\""; } # check for indications that header has raw 8-bit characters if (! $eightbitok) { foreach my $alert (@nonprintablealerts) { $eightbitok = ($line =~ /$alert/s); } } $sawreturnpath += ($line =~ /^Return-Path:/s); # check non-standard header definitions $potentialheader=0; for ($k=0; $k < scalar(@allowednonstandardheaders) && ! $potentialheader; $k++) { $nonstand='^' . $allowednonstandardheaders[$k]; $potentialheader = ($line =~ /$nonstand/s); } $scanning_header = ($line =~ /^[A-Za-z0-9][A-Za-z0-9-]+:/s || $potentialheader) && length($line) <= $maxRFC2822linelen && (all_printable_chars($line) || $eightbitok) && $sawreturnpath < 2; if ($scanning_header) { # learn charset and also find boundaries for multi-part messages ($boundary, $cs) = parse_contenttype($line); if (defined($boundary)) { push(@boundary_string_stack, $boundary); if (! defined($cs)) { # use a default; need a charset to match the boundary $cs="us-ascii"; } } if (defined($cs)) { $charset=lc($cs); push(@charset_stack, $charset); $header_charset_strings{$charset}++; } $enc = parse_contenttransferencoding($line); if (defined($enc)) { $encoding=lc($enc); push(@encoding_stack, $encoding); } # we can also learn about character sets via RFC 2047-style # header line encodings, though these character sets shouldn't # carry over into body processing in compliant emails. # Unfortunately, there are emails in the wild that encode a # character set in the subject line and then omit character set # information for the message body. Search subject lines for # ?=charset? definitions and use this charset as a fallback, if # it's all there is, but only if we have a definition for it. # If there's no definition, it's not going to help with body # processing anyway. if (! defined($charset)) { $charset=parse_subjectforcharset($line); if (defined($charset)) { $charset=lc($charset); if ($DEBUG) { print "FOUND RFC 2047 STYLE CHARSET: \"$charset\"\n"; } # always log occurrence of charset $header_charset_strings{$charset}++; if (defined($charset_definitions{$charset})) { push(@charset_stack, $charset); } else { $charset=undef; } } } # if we see the most recent boundary string, then header # processing is over; let the body processing section handle # this line. An end of line check in the regexp is deliberately # omitted, because the last occurrence of a set of boundary # strings will have "--" appended. We shouldn't actually see # the boundary string while doing header processing--this is # primarily a guard against an ill-formed email which is missing # the (required) blank line between header and body. if (scalar(@boundary_string_stack)) { $b = '(^' . quotemeta("--" . $boundary_string_stack[scalar(@boundary_string_stack) - 1]) . ')(.*)$'; if ($line =~ /$b/) { $scanning_header=0; } } if ($scanning_header) { $email .= $line; if (! $potentialheader) { # non-standard header lines don't count toward the minimum # number needed for a valid email $numheaderlines++; } $line = getline_with_folding(*F); } } } if ($DEBUG) { print "**** END OF HEADER PARSING ***\n"; } } ###################################################################### ###################################################################### sub body_processing { ################################# # BODY processing ################################# # finished with header processing--try to guess where the email body # ends. The following rules are used to figure out where to stop: # # o If no headers were discovered, stop immediately. # # o For MIME multi-part messages, we should have a defined boundary # string. This string is used to delimit the components of a # multi-part message. (Unfortunately) nested multi-part messages # are possible, which means that limited header processing must also # be performed on the "body" of an email message, in order to learn # the associated boundary delimiters, charset declarations, and # content-type encodings. # # o Although it is possible for "Return-Path:" strings to occur # within email bodies, it's rare. Detection of a "Return-Path:" # header results in termination of processing for the current email # message unless this behavior is overridden using a command line # option. In some cases, this is the only reliable way to # resynchronize email recovery (and avoid missing new messages) if # pieces of an email message being processed are missing (e.g., due # to file fragmentation). # # o Use characteristics of either the declared or default charsets # to find boundaries (e.g., seeing 8-bit data in a 7-bit printable # message signifies a boundary). ################################# my $part1; my $part2; my $part3; my $nonstand; my $boundary; my $k; my $b; my $matched; my $remainder; my $potentialheader=0; my $terminalboundarystring=0; my $charset_def; my $cs; my $enc; my $scanning_body = 1; if ($sawreturnpath > 1 || $numheaderlines < $minheaderlines) { $scanning_body = 0; } while (defined($line) && $scanning_body) { if ($DEBUG) { print "==>\"$line\""; } # learn charset and also find boundaries for multi-part messages ($boundary, $cs) = parse_contenttype($line); if (defined($boundary)) { push(@boundary_string_stack, $boundary); if (! defined($cs)) { # use a default $cs="us-ascii"; } } if (defined($cs)) { $charset=lc($cs); push(@charset_stack, $charset); $body_charset_strings{$charset}++; } $enc = parse_contenttransferencoding($line); if (defined($enc)) { $encoding=$enc; push(@encoding_stack, $encoding); } # check for Return-Path: header $potentialheader = ($line =~ /^(.*)(Return-Path:.*$)/s); if ($potentialheader) { # the portion of the line before the Return-Path belongs to this # email, the rest will be considered for the next email $email .= $1; $line = $2; } else { # see if we've run across a non-standard header. Detection of # other headers (aside from Return-Path:, above) isn't done here # because there's too much potential for false positives, when # bodies contain, e.g., "Hello: How are you?" for ($k=0; $k < scalar(@allowednonstandardheaders) && ! $potentialheader; $k++) { $nonstand='^(.*?)(' . $allowednonstandardheaders[$k] . ')(.*$)'; $potentialheader = ($line =~ /$nonstand/s); $part1=$1; $part2=$2; $part3=$3; } if ($potentialheader) { # the portion of the line before the header belongs to this # email, the rest will be considered for the next email $email .= $part1; $line = $part2 . $part3; } } if (! $potentialheader) { ########################### ###### THE FOLLOWING CHARSET DETERMINATION IS INEFFICIENT; ONLY ###### NEED TO DO THIS BEFORE BODY PROCESSING STARTS AND AFTER ###### ENCOUNTERING EACH BOUNDARY STRING ########################### # figure out a reasonable character set to use if (! scalar(@charset_stack) && ! scalar(@encoding_stack)) { # if the character set stack and the encoding stack are empty, # use us-ascii; this is the default case. $charset="us-ascii"; } else { # if the current transfer encoding is base64 or quoted-printable, # then us-ascii is fine. if (scalar(@encoding_stack)&& ($encoding_stack[scalar(@encoding_stack) - 1] eq "base64" || $encoding_stack[scalar(@encoding_stack) - 1] eq "quoted-printable")) { $charset="us-ascii"; } elsif (! scalar(@charset_stack) && scalar(@encoding_stack)) { # if there is an encoding other than base64 or quotable-printable but no # charset, then use "8bit" to be safe. $charset="8bit"; } else { # otherwise look up the character set encoding. $charset_def=$charset_definitions{$charset_stack[scalar(@charset_stack) - 1]}; if (! defined($charset_def)) { # no definition, so to be safe, use "8bit". $charset="8bit"; } else { $charset=$charset_stack[scalar(@charset_stack) - 1]; } } } if ($DEBUG > 2) { print "*********************************\n"; print "********************************* Using charset \"$charset\"\n"; print "charset stack: @charset_stack\n"; print "encoding stack: @encoding_stack\n"; print "*********************************\n"; } } # detect boundary strings in email body if (scalar(@boundary_string_stack)) { $matched=0; my $choppedline = chop_whitespace($line); for ($k=0; $k < scalar(@boundary_string_stack) && ! $matched; $k++) { $b = '^(' . quotemeta("--" . $boundary_string_stack[$k]) . ')(.*)$'; $matched = ($choppedline =~ /$b/); $remainder = $2; } if ($matched) { if (defined($remainder) && $remainder eq "--") { # last of this boundary string my $popped = pop(@boundary_string_stack); # print "just popped \"$popped\", stacklen = " # . scalar(@boundary_string_stack) . "\n"; pop(@charset_stack); pop(@encoding_stack); while (scalar(@boundary_string_stack) && "--" . $popped . "--" ne $choppedline) { $popped=pop(@boundary_string_stack); pop(@charset_stack); pop(@encoding_stack); } if (! scalar(@boundary_string_stack)) { # exhausted boundary stack--signal end of body processing $terminalboundarystring=1; } } else { # not the last occurrence--just pop previous charset and encoding pop(@charset_stack); pop(@encoding_stack); } # balance the charset and encoding stacks, if necessary while (scalar(@charset_stack) < scalar(@encoding_stack)) { if ($encoding_stack[scalar(@encoding_stack) - 1] eq "base64" || $encoding_stack[scalar(@encoding_stack) - 1] eq "quoted-printable") { push(@charset_stack, "us-ascii"); } else { # just use a default charset push(@charset_stack, "8bit"); } } while (scalar(@charset_stack) > scalar(@encoding_stack)) { # charset is more important--just balance stack using "8bit" entries push(@encoding_stack, "8bit"); } } } $scanning_body = ! $terminalboundarystring && ! $potentialheader && all_chars_in_charset($charset, $line); if ($scanning_body) { $email .= $line; $line = getline_with_folding(*F); } } if ($DEBUG) { print "**** END OF BODY PARSING ***\n"; } ############################################### # EMIT email message if it passes sanity checks ############################################### # if we have something reasonable (some headers + a body), then # write the recovered email. if ($email && $numheaderlines >= $minheaderlines) { $endbyte = $startbyte + length($email) - 1; # remember coverage of this email push(@email_coverage, [$startbyte, $endbyte]); $emailcounter++; if ($DEBUG) { print "***************** START OF EMAIL MESSAGE:\n"; print "$email"; print "***************** END OF EMAIL MESSAGE\n"; } my $basefilename = sprintf("%semail-%09d-%09d.txt", ($eightbitok ? "raw8bit" : ""), $startbyte, $endbyte); my $emailfilename = $outputdirectory . "/" . $basefilename; my $logline = sprintf("%-70s\t\t%12d", $basefilename, length($email)); print LOG "$logline\n"; open(EMAIL, ">:raw",$emailfilename) || die "Couldn't write email file $emailfilename\n"; # binmode EMAIL; print EMAIL "$email"; close EMAIL; open(EMAIL, "<:raw",$emailfilename) || die "Couldn't open email file $emailfilename\n"; my $context=new Digest::MD5; $context->reset(); $context->addfile("EMAIL"); my $md5 = $context->hexdigest(); close EMAIL; print ALTLOG "$md5\t" . int($startbyte / 512) . "-" . int($endbyte / 512) . "\t" . length($email) . "\tEMAIL ($basefilename)\n"; # any leftover boundary strings for this email are now orphans; # track the filename + boundary string, charset, and encoding # stacks. Also save the current charset and encoding, because # these are in effect until the next occurrence of a boundary # string. In addition, a "flat" hash of all orphan boundaries # strings is stored, whose keys are the boundaries strings and # whose values are lists of the locations of truly orphaned # instances of the boundary string. The locations will be filled # in after a second pass over the input file (later). if (scalar(@boundary_string_stack)) { $orphan_boundaries{$basefilename}=[[@boundary_string_stack], [@charset_stack], [@encoding_stack], $charset, $encoding]; foreach $k (@boundary_string_stack) { $orphan_boundary_locations{$k} = []; } } } } ###################################################################### sub audit_log_processing { ################################# # WRITE AUDIT LOG ################################# my ($k, $j, $i); # write charset information to log print LOG "\n---------------------------------------------\n"; print LOG "\ncharset definitions used:\n"; foreach $i (keys(%charset_definitions)) { $j = $charset_definitions{$i}; print LOG "$i --> $j\n"; } print LOG "\ncharset values seen in message headers:\n"; print LOG "charset\t\t\t Occurrences\n"; print LOG "-------\t\t\t -----------\n"; foreach $i (keys %header_charset_strings) { $j=$header_charset_strings{$i}; printf(LOG "%-26s %10d\n", $i, $j); } print LOG "\ncharset values seen in message bodies:\n"; print LOG "charset\t\t\t Occurrences\n"; print LOG "-------\t\t\t -----------\n"; foreach $i (keys %body_charset_strings) { $j=$body_charset_strings{$i}; printf(LOG "%-26s %10d\n", $i, $j); } print LOG "\ncharsets (headers) which used default 8bit definition (BAD):\n"; print LOG "charset\n"; print LOG "-------\n"; foreach $i (keys %header_charset_strings) { $j=$charset_definitions{$i}; if (! defined($j)) { printf(LOG "%-26s\n", $i); } } print LOG "\ncharsets (bodies) which used default 8bit definition (BAD):\n"; print LOG "charset\n"; print LOG "-------\n"; foreach $i (keys %body_charset_strings) { $j=$charset_definitions{$i}; if (! defined($j)) { printf(LOG "%-26s\n", $i); } } # write info about orphan boundary strings in log print LOG "\n---------------------------------------------\n"; print LOG "The following information can be used to verify email fragment\n"; print LOG "processing and for manual investigation of email fragmentation.\n"; print LOG "The boundary/charset/encoding stacks are displayed with the value\n"; print LOG "on the top of the stack first.\n"; if (! scalar(keys(%orphan_boundaries))) { print LOG "** NONE **\n"; } else { foreach $i (keys(%orphan_boundaries)) { my @refs=@{$orphan_boundaries{$i}}; my @boundstack=@{$refs[0]}; my @charsetstack=@{$refs[1]}; my @encodestack=@{$refs[2]}; # $refs[3], $refs[4] aren't references, just strings my $chset = (defined($refs[3]) ? $refs[3] : ""); my $enc = (defined($refs[4]) ? $refs[4] : ""); print LOG "\n--------------------------\n"; print LOG "Filename: \"$i\", charset/encoding in effect\n"; print LOG "at end of this email fragment: \"$chset\"/\"$enc\".\n\n"; print LOG "Orphan boundary strings for this file:\n\n"; my ($b, $c, $e); while ($b = pop(@boundstack)) { if (scalar(@charsetstack)) { $c = pop(@charsetstack); } else { $c = ""; } if (scalar(@encodestack)) { $e = pop(@encodestack); } else { $e = ""; } print LOG sprintf("%s with charset \"%s\" and encoding \"%s\".\n", $b, $c, $e); print LOG "Locations:\n"; my @locs = @{$orphan_boundary_locations{$b}}; if (! scalar(@locs)) { print LOG "NOT FOUND.\n"; } else { for ($k=0; $k < scalar(@locs); $k++) { my $start = ${$locs[$k]}[0]; my $index = ${$locs[$k]}[1]; print LOG sprintf("%20d %s (line # %d [indexed from 0])\n", $start, ($index < 0) ? "TERMINAL" : " ", abs($index)); } } } } } print LOG "\n---------------------------------------------\n"; } ###################################################################### ###################################################################### sub fragmentation_processing { ###################################################################### ## ** RUDIMENTARY FRAGMENTATION PROCESSING ** ## ## Needs significant work--currently buffers the entire image file to ## avoid buffering issues as the technique is worked out. ###################################################################### my $i; my $j; my $k; if (scalar(keys( %orphan_boundary_locations))) { # rewind the input file and discover locations of orphaned boundary # strings that lie outside recovered email boundaries. seek(F, 0, 0) || die "Couldn't rewind input file.\n"; # suck up the entire input file; we don't need the whitespace folding # support used in the first pass, because boundary lines must always # be on a separate line. print "Fragmentation processing beginning. This may take a while...\n"; print "Reading input file...\n"; my @lines=; print "Read " . scalar(@lines) . " lines, processing...\n"; print "# of unique, potentially orphaned boundary strings: " . scalar(keys(%orphan_boundary_locations)) . "\n"; print "Discovering locations of orphaned boundary strings...\n"; $filepos=0; $previousprocessed=-1; ################################################################### # Phase 1: search input file for all orphaned boundary strings and # remember positions ################################################################### for ($i=0; $i < scalar(@lines); $i++) { $percentprocessed = int($i / scalar(@lines) * 100); if ($percentprocessed > $previousprocessed) { $previousprocessed=$percentprocessed; print "Completed: $percentprocessed%\n"; } $line=$lines[$i]; $startbyte=$filepos; foreach $k (keys(%orphan_boundary_locations)) { my $match=0; my $terminal=0; my $locsref = $orphan_boundary_locations{$k}; my $b = '(^' . quotemeta("--") . $k . ')(.*)$'; if ($line =~ /$b/) { $match=1; $endbyte = $startbyte + length($1) + length($2); if ($2 eq "--") { # terminal occurrence of this boundary string $terminal=1; } } # not an orphaned instance if it lies within a recovered email $match = $match && ! covered($startbyte, $endbyte); if ($match) { # mark terminal occurrences by negating the index my $index = ($terminal ? -1 * $i : $i); # remember file position and index of match push(@{$locsref}, [$startbyte, $index]); } } $filepos += length($line); } ################################################################### # Phase 2: use orphaned boundary string locations to do forward # and backward carving from the location of each orphaned boundary # string. # # [ This phase can discover new boundary strings (e.g., for # "tree-like" multi-part emails) which could then discover new # orphaned boundary stringss... there's a point of diminishing # return; we don't currently do recursive fragmentation # processing] ################################################################### ###### LIMITATIONS: We currently support recovery of ###### bi-fragmented emails, which is common. When applied to ###### emails with "holes" or that are fragmented into multiple ###### pieces, full recovery will fail. The more complex cases ###### need to be added after an extended period of drinking wine ###### and eating very smelly, expensive French cheese on the ###### balcony. print "Processing fragments associated with orphaned boundary strings...\n"; # for each file with orphaned boundary strings... foreach $i (keys(%orphan_boundaries)) { print "Processing fragments for \"$i\"...\n"; # retrieve info about orphaned boundary strings, charset and # encoding stacks and locations of orphaned boundary strings my @refs=@{$orphan_boundaries{$i}}; my @boundstack=@{$refs[0]}; my @charsetstack=@{$refs[1]}; my @encodestack=@{$refs[2]}; # $refs[3], $refs[4] aren't references, just strings my $chset = (defined($refs[3]) ? $refs[3] : ""); my $enc = (defined($refs[4]) ? $refs[4] : ""); my @locs; # skip boundary strings in stack which weren't located $j=0; @locs = @{$orphan_boundary_locations{$boundstack[$j]}}; while (! scalar(@locs) && $j < scalar(@boundstack)) { @locs = @{$orphan_boundary_locations{$boundstack[$j]}}; $j++; } if ($j == scalar(@boundstack)) { print "No boundary strings found, skipping this file.\n"; next; } # do simple sanity check on orphaned boundary strings that were # located to make sure this email is bifragmented; more complex # cases aren't currently handled. my $bifragmented=1; my $endindex = abs(${$locs[scalar(@locs)-1]}[1]); $endbyte = ${$locs[scalar(@locs)-1]}[0] + length($lines[$endindex]); my $startindex; for ( ; $j < scalar(@boundstack) && $bifragmented; $j++) { @locs = @{$orphan_boundary_locations{$boundstack[$j]}}; for ($k=scalar(@locs) - 1; $k >= 0 && $bifragmented; $k--) { $startindex = abs(${$locs[$k]}[1]); $startbyte = ${$locs[$k]}[0]; $bifragmented = $startindex <= $endindex; } } if (covered($startbyte, $endbyte)) { print "Email fragment overlaps previously recovered email, skipping this file.\n"; next; } if (! $bifragmented) { print "Email fragment is not bifragmented, skipping this file.\n"; next; } # extend $startindex backward as long as charset permits and we # don't overlap a previously recovered email while ($startindex - 1 >= 0 && all_chars_in_charset($chset, $lines[$startindex - 1]) && ! covered($startbyte - length($lines[$startindex - 1]), $endbyte)) { $startindex--; $startbyte -= length($lines[$startindex]); } # EMIT email fragment # there are sometimes duplicate boundary strings--is it better not to cover the # recovered fragments?? # remember coverage of this fragment # push(@email_coverage, [$startbyte, $endbyte]); # $emailcounter++; my $basefilename = sprintf("FRAG-%09d-%09d-FOR-%s", $startbyte, $endbyte, $i); my $emailfilename = $outputdirectory . "/" . $basefilename; my $logline = sprintf("%-70s\t\t%12d", $basefilename, ($endbyte - $startbyte)); print LOG "$logline\n"; open(EMAIL, ">:raw",$emailfilename) || die "Couldn't write email file $emailfilename\n"; # binmode EMAIL; for ($j=$startindex; $j <= $endindex; $j++) { print EMAIL $lines[$j]; } close EMAIL; open(EMAIL, "<:raw",$emailfilename) || die "Couldn't open email file $emailfilename\n"; my $context=new Digest::MD5; $context->reset(); $context->addfile("EMAIL"); my $md5 = $context->hexdigest(); close EMAIL; print ALTLOG "$md5\t" . int($startbyte / 512) . "-" . int($endbyte / 512) . "\t" . ($endbyte - $startbyte) . "\tEMAIL ($basefilename)\n"; } print "Done.\n"; } else { print "No orphaned boundary strings, fragmentation processing skipped.\n"; } } ##################################################################### ##################################################################### ##################################################################### ## START OF MAIN ##################################################################### ##################################################################### # process options my $result = GetOptions("o=s" => \$outputdirectory, "n=i" => \$maxRFC2822linelen, "f=s" => \$imagefilename); if ($imagefilename eq '') { die "Usage: recover-rfc-2822.pl -f [-n ] [-o ]\n"; } # open image file open(F, "<:raw", $imagefilename) || die "Can't open image file.\n"; # remember file size $imagefilesize = -s $imagefilename; #my @layers = PerlIO::get_layers(F); #print "@layers\n"; #binmode F; #@layers = PerlIO::get_layers(F); #print "@layers\n"; # create output directory mkdir($outputdirectory) || die "Can't create output directory--does it already exist?\n"; # open log file in output directory open LOG, ">" . $outputdirectory . "/" .$auditfilename || die "Can't open log file.\n"; # open log file in output directory open ALTLOG, ">" . $outputdirectory . "/" .$altauditfilename || die "Can't open alt log file.\n"; print LOG "Image file processed by $VERSION: \"$imagefilename\"\n\n"; my $logline = sprintf("%-70s\t\t%12s", "Filename", "Length"); print LOG "$logline\n"; $logline = sprintf("%-70s\t\t%12s", "--------", "------"); print LOG "$logline\n"; $line = getline_with_folding(*F); while (defined($line)) { # reset globals before processing next email $email=undef; $eightbitok=0; $numheaderlines=0; $charset=undef; $encoding=undef; $sawreturnpath=0; @boundary_string_stack=(); @charset_stack=(); @encoding_stack=(); # try to process one email preheader_processing(); header_processing(); body_processing(); } fragmentation_processing(); audit_log_processing(); print "Results in $outputdirectory, audit log is \"$outputdirectory/$auditfilename\".\n"; print "MD5s and sector coverage in \"$outputdirectory/$altauditfilename\".\n"; close F; close LOG; close ALTLOG;