// Lodovico Marziale
// 06/14/07
//
// aviparse attempts to do intelligent carving of
// avi files from a disk image file, based on structures
// specific to such files at known relative offsets to one
// another.
//
// Attempts to handle (lightly) fragmented files by 
// recognizing data blocks which do not appear within
// a known found file.
//
// Usage: ./aviparse <imagefile>
//

// NOTE: parsed sizes in avi files do not include the size of
// the fourcc or the size if the 4-byte size number 
// Headers generally exist as 4-byte fourcc followed by 4-bytes
// element size and sometimes a 4-byte string; accounting for 
// this is the reason for all of the +4s and  +8s and +12s scattered about.
//
// TODO Fix this.

// NOTE: many avi creators disagree about the size number
// in the main RIFF????AVI header. For some it is the full size 
// of the file minus 8 bytes (for RIFF????). For some it is as
// above minus the index. For some it does not include rounding
// of the file's size up to the nearest sector boundary.
// Here, we will recover up to the idx1 + idx1_size rounded
// up to the nearest sector boundary, or size in the RIFF header
// plus 8 rounded up to the nearest sector boundary, whichever is larger.
//



#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>

#define TRUE 1
#define FALSE 0

#define SECTOR_SIZE 512
#define MAX_AVI_SIZE 2147483647 // 2GB - 1

#define NUMSIGS		7
#define MAXSIGLEN 	15  // seems safe for now


#define RIFFA 		0
#define LISTH 	1
#define AVIH 		2	 
#define LISTS		3
#define JUNK		4
#define LISTM 	5
#define IDX1 		6


//#define STRH 		7
//#define STRF 		8
//#define INDX 		9
//#define RIFFAX 		10
//#define LISTR		11

// A FOURCC (four-character code) is a 32-bit unsigned 
// integer created by concatenating four ASCII characters. 
// For example, the FOURCC 'abcd' is represented on a 
// Little-Endian system as 0x64636261
typedef unsigned long 	FOURCC;
typedef unsigned long 	DWORD;
typedef unsigned short WORD;
typedef unsigned char 	BYTE;


// main header contains global information for the entire AVI file
struct _avimainheader {
    FOURCC fcc;		// 'avih'
    DWORD  cb;		// size minus 8 bytes
    DWORD  dwMicroSecPerFrame;
    DWORD  dwPaddingGranularity;	// pad to multiples of this size; normally 2K for cd-rom
    DWORD  dwMaxBytesPerSec;
    DWORD  dwFlags;
    DWORD  dwTotalFrames;
    DWORD  dwInitialFrames;
	// There is one strl - List for each stream. If the number of strl - Lists inside the hdrl - List
	// is different from MainAVIHeader::dwStreams, a fatal error should be reported.
    DWORD  dwStreams;
    DWORD  dwSuggestedBufferSize;
    DWORD  dwWidth;
    DWORD  dwHeight;
    DWORD  dwReserved[4];
}; 


// describes an AVI 1.0 index ('idx1' format). New 
// AVI files should use an AVI 2.0 index ('indx' format)
struct _avioldindex {
   FOURCC  fcc;
   DWORD   cb;
   struct _avioldindex_entry {
      DWORD   dwChunkId;
      DWORD   dwFlags;
      DWORD   dwOffset;
      DWORD   dwSize;
  } aIndex[];
};


// returns true if index exists in index_list, where list is of length next_free
int sig_at(unsigned long index, unsigned long *index_list, int next_free) {
	int result = FALSE;
	int i;
	for(i =0; i < next_free; i++) {
		if(index_list[i] == index) {
			result = TRUE;			
			index_list[i] = 0;
		}
	}
	return result;
}


// parses an unsigned long from the beginning of buf
unsigned long parse_size(char *buf) {
	unsigned long *long_buf = (unsigned long *)buf;
	return long_buf[0];
}


// find the next file fragment by searching sequentially thru index_list 
// for a signature at the proper offset within a 512-byte sector
unsigned long get_next_fragment(unsigned long *index_list, int next_free, unsigned long index) {
	
	unsigned long result = FALSE;
	unsigned long target_offset = index % SECTOR_SIZE;
	unsigned long offset;
	int i;
	for(i = 0; i < next_free; i++) {
		offset = index_list[i] % SECTOR_SIZE;
		if(offset == target_offset) {
			result = index_list[i];
			index_list[i] = 0;
		}
	}
	return result;
}


// parses a stream list from current in buf, returns the positionof the byte
// after the parsed stream
unsigned long parse_stream(char *buf, unsigned long current, unsigned long **sig_indexes, int *next_free) {
	
	unsigned long test;
	if(sig_at(current, sig_indexes[LISTS], next_free[LISTS])) {
		unsigned long lists_size = parse_size(buf + current + 4);
		current += lists_size + 8; // always have to add in for the fourcc and the size field
		// check for junk padding
		while(sig_at(current, sig_indexes[JUNK], next_free[JUNK])) {
			current += parse_size(buf + current + 4) + 8;
		}
	}
	else {		
		// search sig_indexes for a sig at an appropriate index into a sector
		test = get_next_fragment(sig_indexes[LISTS], next_free[LISTS], current);
		// if we couldn't find the remainder of the file
		if(test == FALSE) {
			// print out end of fragment index followed by "f" so we know it's a fragment
			printf("%lu f", (current + (SECTOR_SIZE -  (current % SECTOR_SIZE))) / SECTOR_SIZE);
			return 0;
		}
		else {
			printf("%lu,", (current + (SECTOR_SIZE - (current % SECTOR_SIZE))) / SECTOR_SIZE);			
			current = test;
			printf("%lu-", (current - (current % SECTOR_SIZE)) / SECTOR_SIZE);
			while(sig_at(current, sig_indexes[JUNK], next_free[JUNK])) {
				current += parse_size(buf + current + 4) + 8;
			}
		}
	}
	return current;
}


// attempts to parse an avi file from a header position given the indexes
// of all signatures in the image
int parse_avi_file(char *buf, unsigned long **sig_indexes, int *next_free, unsigned long index) {
	
	unsigned long current = index;
	unsigned long test;
	unsigned long file_end;
	
	printf("%lu-", (current) / SECTOR_SIZE);
	unsigned long riffa_size = parse_size(buf + current + 4);
		
	unsigned long listh_size = 0;
	current += 12;  // riffa header is 12 bytes
	if(sig_at(current, sig_indexes[LISTH], next_free[LISTH])) {
		listh_size = parse_size(buf + current + 4);
	}
	else {
		// don't continue parsing - this should be in the first sector; if it's
		// not where it's supposed to be we probably have a false avi file
	}
	
	current += 12; // size of hdrl header
	int num_streams = 0;
	if(sig_at(current, sig_indexes[AVIH], next_free[AVIH])) {
		struct _avimainheader *avi_main = calloc(1, sizeof(struct _avimainheader));
        memcpy(avi_main, &buf[current], sizeof(struct _avimainheader)); 
		num_streams = avi_main->dwStreams;
	}
	else {		
		// don't continue parsing - this should be in the first sector; if it's
		// not where it's supposed to be we probably have a false avi file
	}
	
	current += sizeof(struct _avimainheader);
	int j;
	for(j = 0; j < num_streams; j++) {
		// this is the first construction which may exist outside the first 512-byte sector --
		// actually only past the first iteration
		current = parse_stream(buf, current, sig_indexes, next_free);
		if(current == 0) {
			return FALSE;
		}
	}
	
	// as long as we don't see the index, we should keep seeing
	// data chunks
	while(!sig_at(current, sig_indexes[IDX1], next_free[IDX1])) {
		if(sig_at(current, sig_indexes[LISTM], next_free[LISTM])) {
			unsigned long listm_size = parse_size(buf + current + 4);
			current += listm_size + 8;
		}
		else { // skip to next fragment
			printf("%lu,", (current + (SECTOR_SIZE - (current % SECTOR_SIZE))) / SECTOR_SIZE);
			test = get_next_fragment(sig_indexes[LISTM], next_free[LISTM], current);
			// if we couldn't find the remainder of the file
			if(test == FALSE) {
				test = get_next_fragment(sig_indexes[IDX1], next_free[IDX1], current);
				if(test == FALSE) {
					printf("%lu f", (current + (SECTOR_SIZE -  (current % SECTOR_SIZE))) / SECTOR_SIZE);
					return FALSE;
				}
				else {
					current = test;
					// must print out the file blocks since we've jumped.
					// align to nearest previous sector boundary
					
					printf("%lu-", (current - (current % SECTOR_SIZE)) / SECTOR_SIZE);

					unsigned long idx1_size = parse_size(buf + current + 4);
					// file end is greater of (idx1 index + idx1_size rounded up to the
					// nearest sector boundary) and (size indicated by the RIFF header
					// plus 8 bytes)
					file_end = current + idx1_size +8;
					file_end += SECTOR_SIZE - (file_end % SECTOR_SIZE);
					file_end = (file_end > riffa_size + 8) ? file_end : riffa_size + 8;  
					printf("%lu\n\n", (file_end) / SECTOR_SIZE);					
					return TRUE;
				}
			}
			else {
				current = test;
			}
		}
	}
	
	// finally (hopefully) we have the index
	unsigned long idx1_size = parse_size(buf + current + 4);
	// file end is greater of (idx1 index + idx1_size rounded up to the
	// nearest sector boundary) and (size indicated by the RIFF header
	// plus 8 bytes rounded up to the end of the sector)
	file_end = current + idx1_size +8;
	file_end += SECTOR_SIZE - (file_end % SECTOR_SIZE);
	file_end = (file_end > riffa_size + 8) ? file_end : riffa_size + 8;  
	printf("%lu\n", (file_end) / SECTOR_SIZE);
	return TRUE;	
}


// parse_avi_files should have zero-ed all signatures in sig_indexes which were used
// in complete avi files. What should be left are junk signatures and signatures which
// are part of avi fragments. We will eliminate the junk by looking at the size field; if
// it is larger than the size of the image or 2GB then we will assume that the signature
// is junk. 

// NOTE: the above method of eliminating junk using size is not good in the general
// case where the image will be certainly larger than the 4-byte size field,

void parse_file_fragments(char *buf, unsigned long buf_length, unsigned long **sig_indexes, int *next_free) {

	unsigned long test;
	unsigned long file_end;
	
	// get rid of the junk first
	int i;
	for(i = 0; i < NUMSIGS; i++) {
		int j;
		for(j = 0; j < next_free[i]; j++) {
			unsigned long length = parse_size(buf + sig_indexes[i][j] + 4);
			if((length > MAX_AVI_SIZE) || (length > buf_length)) {
				sig_indexes[i][j] = 0;
			}
		}
	}
	// for each stream_list signature attempt to parse a stream list, the movi list then index
	unsigned long current;
	int k;
	for(k = 0; k < next_free[LISTS]; k++) {
		current = sig_indexes[LISTS][k];
		if(current > 0) { // we have something here
			printf("%lu-", (current) / SECTOR_SIZE);
			
			// this is the first construction which may exist outside the first 512-byte sector --
			// actually only past the first iteration
			current = parse_stream(buf, current, sig_indexes, next_free);
			if(current == 0) {
			}
		
	
			// as long as we don't see the index, we should keep seeing
			// data chunks
			while(!sig_at(current, sig_indexes[IDX1], next_free[IDX1])) {
				if(sig_at(current, sig_indexes[LISTM], next_free[LISTM])) {
					unsigned long listm_size = parse_size(buf + current + 4);
					current += listm_size + 8;
				}
				else { // skip to next fragment
					printf("%lu,", (current + (SECTOR_SIZE - (current % SECTOR_SIZE))) / SECTOR_SIZE);
					test = get_next_fragment(sig_indexes[LISTM], next_free[LISTM], current);
					// if we couldn't find the remainder of the file
					if(test == FALSE) {
						test = get_next_fragment(sig_indexes[IDX1], next_free[IDX1], current);
						if(test == FALSE) {
							printf("%lu f", (current + (SECTOR_SIZE -  (current % SECTOR_SIZE))) / SECTOR_SIZE);
							break;
						}
						else {
							current = test;
							// must print out the file blocks since we've jumped.
							// align to nearest previous sector boundary
					
							printf("%lu-", (current - (current % SECTOR_SIZE)) / SECTOR_SIZE);

							unsigned long idx1_size = parse_size(buf + current + 4);
							file_end = current + idx1_size +8;
							file_end += SECTOR_SIZE - (file_end % SECTOR_SIZE);
							printf("%lu\n\n", (file_end) / SECTOR_SIZE);					
						}
					}
					else {
						current = test;
					}
				}
			}
	
			// finally (hopefully) we have the index
			if(sig_at(current, sig_indexes[IDX1], next_free[IDX1])) {
				unsigned long idx1_size = parse_size(buf + current + 4);
				// file end is greater of (idx1 index + idx1_size rounded up to the
				// nearest sector boundary) and (size indicated by the RIFF header
				// plus 8 bytes rounded up to the nearest sector boundary)
				file_end = current + idx1_size +8;
				file_end += SECTOR_SIZE - (file_end % SECTOR_SIZE);
				printf("%lu\n", (file_end) / SECTOR_SIZE);
			}
		}
	}
}


// Given the image file and the list of found signature indexes
// attempt to reconstruct all the avi files we can; beginning of a
// chain of calls to parse smaller and smaller (sub) units
// of avi files.
void parse_avi_files(char *buf, unsigned long length, unsigned long **sig_indexes, int *next_free) {
	
	// top level; start with main avi file signatures
	int result;
	int i;
	for(i = 0; i < next_free[RIFFA]; i++) {	
		result = parse_avi_file(buf, sig_indexes, next_free, sig_indexes[RIFFA][i]); 
		// clear the index from the lst
		sig_indexes[RIFFA][i] = 0;
	}
	parse_file_fragments(buf, length, sig_indexes, next_free);
}


// returns TRUE if the given magic signature appears at the beginning of buffer
// or FALSE otherwise; treats '?' in magic as a one-character wildcard
int magic_here(char *buffer, const char *magic, int magic_len) {
	int result = FALSE; 
	int i = 0;
	while(((buffer[i] == magic[i]) || (magic[i] == '?'))  && (i < magic_len)) {
		i++;
	}
	if(i == magic_len) {
		result = TRUE;
	}
	return result;
}  


// Search for and record all relevant avi signatures/structures in an image file.
// For now we just use arrays to record the positions of the relevant signatures;
// arrays statically allocated to hold 500 elements per signature.
void parse(char *buf, unsigned long length, unsigned long **sig_indexes) {
	
	// for each of the NUMSIGS arrays of found signatures, the next free
	// array index
	int *next_free = calloc(NUMSIGS, sizeof(int));
	
	char *sigs[NUMSIGS];
	sigs[RIFFA] =	"RIFF????AVI ";  // beginning of an avi file
	sigs[LISTH] =	"LIST????hdrl"; 
	sigs[AVIH] =  	"avih";	// avi main header signature -- struct follows
	sigs[LISTS] =	"LIST????strl";
	sigs[JUNK] =	"JUNK";	// demarcates padding junk
	sigs[LISTM] =	"LIST????movi"; 
	sigs[IDX1] = 	"idx1"; // beginning of the (optional) avi index -- struct follows
	
	unsigned long i;
	// for each byte
	for(i =0; i  < length; i++) {
		int j;
		// iterate thru sigs
		for(j = 0; j < NUMSIGS; j++) {
			// if a sig is here
			if(magic_here(buf + i, sigs[j], strlen(sigs[j]))) {
				// put index of the found sig in sig_indexes
				sig_indexes[j][next_free[j]] = i;
				next_free[j] += 1;
			}
		}
	}
	parse_avi_files(buf, length, sig_indexes, next_free);
}


int main(int argc, char *argv[]) {
	
	if(argc != 2) {
		printf("Usage: ./aviinfo <zipfile_candidate>\n");
		exit(1);
	}
	
	// make sure we can open file
	char *filename = argv[1]; 
	FILE *image;
	if((image = fopen(filename, "rb")) == NULL) {
		printf("Couldn't open input file: %s for reading.\n", filename);
		exit(1);
	}
	
	// get file size	
	struct stat stbuf;
	if(stat(filename, &stbuf) != 0) {
		printf("Couldn't stat input file: %s.\n", filename);
		exit(1);
	}
	unsigned long f_size = (unsigned long)stbuf.st_size;

	// get the bytes
	char *bytes = malloc(sizeof(char) * f_size);
	fread(bytes, 1, f_size, image);
	
	// allocate memory for signature indexes; we will statically
	// allocate enough room to find 500 of each -- this is a first try
	int num_sigs = 12;
	int max_indexes_per_sig = 500;
	unsigned long **sig_indexes = calloc(num_sigs, sizeof(unsigned long *));
	int i;
	for(i = 0; i < num_sigs; i++) {
		sig_indexes[i] = calloc(max_indexes_per_sig, sizeof(unsigned long));
	}
	
	// pass to verify
    parse(bytes, f_size, sig_indexes);

	return 0;    
}

