// Lodovico Marziale
// 06/14/07
//
// zipparse attempts to do intelligent carving of
// zip files from a disk image file, based on structures
// specific to such files at known relative offsets to one
// another.
//
// Attempts to handle (lightly) fragmented files by 
// recognizing data blocks which do not appear within
// a known found file. The locations of signatures and
// other metadata nearby are printed to standard out
// after the offsets of completed files are printed.
//
// Usage: ./zipparse <imagefile>
//


#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>


#define TRUE 1
#define FALSE 0

#define SECTOR_SIZE 512
#define MAX_ZIP_SIZE 2147483647 	// 2GB - 1
#define MAX_ERROR 25

#define NUMSIGS		3
#define MAXSIGLEN 	15  // seems safe for now

#define LFH 		0	// local file header
#define CDFH 	1 	// central directory file header
#define EOCD 	2 	// end of central directory header 

// Fixed-size part of a zip local file header, not including the 4-byte
// signature: callers copy it from the image starting at signature offset + 4.
// The variable-length file name and extra field follow it in the image.
//
// NOTE(review): the "26 bytes" figure assumes unsigned long is 4 bytes wide.
// Where unsigned long is 8 bytes this packed struct is larger than the
// on-disk record and the fields from checksum onward no longer line up with
// the image bytes -- confirm the target ABI before trusting parsed values.
struct local_file_hdr { // 26 bytes
//	unsigned long sig;          (signature, skipped by the callers)
	unsigned short ver_needed;	// version needed
	unsigned short flags;		// bit flags
	unsigned short comp_meth;	// compression method
	unsigned short mod_time;	// time of last modification
	unsigned short mod_date;	// date of last modification
	unsigned long checksum;		// checksum
	unsigned long comp_size;	// compressed size
	unsigned long orig_size;	// original size
	unsigned short fname_len;	// length of file name
	unsigned short efield_len;	// length of extra field
//	file name (variable size, fname_len bytes)
//  extra field (variable size, efield_len bytes)
} __attribute__ ((packed));


// Fixed-size part of a zip central directory file header, not including the
// 4-byte signature: callers copy it from the image starting at signature
// offset + 4. The file name, extra field and file comment follow it.
//
// NOTE(review): the "42 bytes" figure assumes unsigned long is 4 bytes wide.
// Where unsigned long is 8 bytes this packed struct is larger than the
// on-disk record and the fields from mod_date_time onward no longer line up
// with the image bytes -- confirm the target ABI.
struct cd_file_hdr { // 42 bytes
//  unsigned long sig;      // signature: 0x02014b50 (skipped by the callers)
  unsigned short ver_made_by;  // version made by
  unsigned short ver_needed;    // version needed
  unsigned short flags;   // bit flags
  unsigned short comp_meth;  // compression method
  unsigned long mod_date_time;  // time of last modification
  unsigned long checksum;  // checksum
  unsigned long comp_size; // compressed size
  unsigned long orig_size; // original size
  unsigned short fname_len; // length of file name
  unsigned short efield_len; // length of extra field
  unsigned short comment_len; // length of file comment
  unsigned short disk_start;   // disk number start
  unsigned short int_attrib;   // internal file attributes
  unsigned long ext_attrib;    // external file attribute
  unsigned long loc_hdr_rel_offset;    // relative offset of local header
  // followed in the image by:
  //   file name    (fname_len bytes)
  //   extra field  (efield_len bytes)
  //   file comment (comment_len bytes)
} __attribute__ ((packed));


// Fixed-size part of a zip end-of-central-directory record, not including
// the 4-byte signature: callers copy it from the image starting at signature
// offset + 4. The zip file comment follows it.
//
// NOTE(review): the "18 bytes" figure assumes unsigned long is 4 bytes wide.
// Where unsigned long is 8 bytes this packed struct is larger than the
// on-disk record and cd_size, offset and comment_length no longer line up
// with the image bytes -- confirm the target ABI.
struct end_of_cd { // 18 bytes
//  unsigned long sig;       // signature: 0x06054b50 (skipped by the callers)
  unsigned short disk_num;  // number of this disk
  unsigned short cd_start_disk;    // number of disk with start of cd
  unsigned short num_entries_here;  // number of cd entries on this disk
  unsigned short num_entries;      // total number of cd entries
  unsigned long cd_size;    // size of cd
  unsigned long offset;    // offset of start of cd
  unsigned short comment_length; // length of ZIP file comment
  // followed in the image by the zip file comment (comment_length bytes)
} __attribute__ ((packed));


// attempts to parse unfragmented zip files
int parse_zip_file(char *buf, unsigned long **sig_indexes, int *next_free, unsigned long index) {
	
	int result = FALSE;
	unsigned long current = index;
	
	// we'll go through each LFH and try to parse out a file
	struct local_file_hdr *local_fh = calloc(1, sizeof(struct local_file_hdr));
    memcpy(local_fh, &buf[current + 4], sizeof(struct local_file_hdr));
	unsigned long target_offset = current + local_fh->comp_size + local_fh->fname_len + local_fh->efield_len + 4 + sizeof(struct local_file_hdr);
	
	int k;
	for(k = 0; k < next_free[LFH]; k++) {
		if((sig_indexes[LFH][k] > target_offset) && (sig_indexes[LFH][k] < target_offset + MAX_ERROR)) {	
			current = sig_indexes[LFH][k];
			struct local_file_hdr *l_fh = calloc(1, sizeof(struct local_file_hdr));
    		memcpy(l_fh, &buf[current + 4], sizeof(struct local_file_hdr));
			target_offset = current + l_fh->comp_size + l_fh->fname_len + l_fh->efield_len + 4 + sizeof(struct local_file_hdr);
			sig_indexes[LFH][k] = 0;
			k = 0;			
		}
	}
	int m;
	for(m = 0; m < next_free[CDFH]; m++) {
		if((sig_indexes[CDFH][m] > target_offset) && (sig_indexes[CDFH][m] < target_offset + MAX_ERROR)) {	
			current = sig_indexes[CDFH][m];
			sig_indexes[CDFH][m] = 0;
			struct cd_file_hdr *cd_fh = calloc(1, sizeof(struct cd_file_hdr));
       		memcpy(cd_fh, &buf[current + 4], sizeof(struct cd_file_hdr)); 
			target_offset = current + cd_fh->fname_len + cd_fh->efield_len + cd_fh->comment_len + sizeof(struct cd_file_hdr) + 4;
			m = 0;			
		}
	}	
	int n;
	for(n = 0; n < next_free[EOCD]; n++) {
		if((sig_indexes[EOCD][n] >= target_offset) && (sig_indexes[EOCD][n] <= target_offset + MAX_ERROR)) {
			current = sig_indexes[EOCD][n];
			sig_indexes[EOCD][n] = 0;
			struct end_of_cd *e_cd = calloc(1, sizeof(struct end_of_cd));
        	memcpy(e_cd, &buf[current + 4], sizeof(struct end_of_cd)); 
			unsigned long file_length = (current + e_cd->cd_size + 4) - index;
			printf("%lu-%lu, length: %lu\n", (index) / SECTOR_SIZE, (current + e_cd->cd_size + 4) / SECTOR_SIZE, file_length);
			result = TRUE;
			break;
		}
	}	
	return result;
}


// Given the image file and the list of found signature indexes
// attempt to reconstruct all the zip files we can; then print out 
// the structures for signatures which were not used in recovering
// complete files.
void parse_zip_files(char *buf, unsigned long length, unsigned long **sig_indexes, int *next_free) {
	
	// top level; start with main zip file signatures
	printf("\nComplete files:\n\n");
	int result;
	int i;
	for(i = 0; i < next_free[LFH]; i++) {	
		if(sig_indexes[LFH][i]  > 0) {
			result = parse_zip_file(buf, sig_indexes, next_free, sig_indexes[LFH][i]); 
			// clear the index from the list
			if(result == TRUE) {
				sig_indexes[LFH][i] = 0;
			}
		}
	}
	// now we print out structures which were not used in the completely
	// recovered files.
	printf("\nUnused pieces:\n\n");
	int j;
	for(j = 0; j < next_free[LFH]; j++) {
		if(sig_indexes[LFH][j] > 0) {
			struct local_file_hdr *local_fh = calloc(1, sizeof(struct local_file_hdr));
        	memcpy(local_fh, &buf[sig_indexes[LFH][j] + 4], sizeof(struct local_file_hdr)); 
			printf("local file header at: %lu\n", sig_indexes[LFH][j]);
			printf("ver_needed: %u\n", local_fh->ver_needed);
			printf("flags: %u\n", local_fh->flags);
			printf("comp_meth: %u\n", local_fh->comp_meth);
			printf("mod_time: %u\n", local_fh->mod_time);
			printf("mod_date: %u\n", local_fh->mod_date);
			printf("checksum: %lu\n", local_fh->checksum);
		    printf("comp_size: %lu\n", local_fh->comp_size);
			printf("orig_size: %lu\n", local_fh->orig_size);
			printf("fname_len: %u\n", local_fh->fname_len);
       		printf("exfield_len: %u\n", local_fh->efield_len);
			printf("\n"); 
		}
	}
	int k;
	for(k = 0; k < next_free[CDFH]; k++) {
		if(sig_indexes[CDFH][k] > 0) {
			 struct cd_file_hdr *cd_fh = calloc(1, sizeof(struct cd_file_hdr));
       		memcpy(cd_fh, &buf[sig_indexes[CDFH][k] + 4], sizeof(struct cd_file_hdr)); 
			printf("central directory file header at: %lu\n", sig_indexes[CDFH][k] );
			printf("ver_made_by: %d\n", cd_fh->ver_made_by);
			printf("ver_needed: %d\n", cd_fh->ver_needed);
			printf("flags: %u\n", cd_fh->flags);
			printf("comp_meth: %u\n", cd_fh->comp_meth);
			printf("mod_date_time: %lu\n", cd_fh->mod_date_time);
			printf("checksum: %lu\n", cd_fh->checksum);
		    printf("comp_size: %lu\n", cd_fh->comp_size);
			printf("orig_size: %lu\n", cd_fh->orig_size);
			printf("fname_len: %u\n", cd_fh->fname_len);
           	printf("exfield_len: %u\n", cd_fh->efield_len);
			printf("comment_len: %u\n", cd_fh->comment_len);
			printf("disk_start: %u\n", cd_fh->disk_start);
			printf("int_attrib: %u\n", cd_fh->int_attrib);
			printf("ext_attrib: %lu\n", cd_fh->ext_attrib);
	        printf("loc_hdr_rel_offset: %lu\n", cd_fh->loc_hdr_rel_offset);
			printf("\n");
		}
	}
	int m;
	for(m = 0; m < next_free[EOCD]; m++) {
		if(sig_indexes[EOCD][m] > 0) {
			 struct end_of_cd *e_cd = calloc(1, sizeof(struct end_of_cd));
        	memcpy(e_cd, &buf[sig_indexes[EOCD][m] + 4], sizeof(struct end_of_cd)); 
			printf("end of central directory at: %lu\n", sig_indexes[EOCD][m]);
        	printf("disk_num: %u\n", e_cd->disk_num);
	       	printf("cd_start_disk: %u\n", e_cd->cd_start_disk);
        	printf("num_entries_here: %u\n", e_cd->num_entries_here);
       		printf("num_entries: %u\n", e_cd->num_entries);
        	printf("cd_size: %lu\n", e_cd->cd_size);
	       	printf("offset: %lu\n", e_cd->offset);
    	   	printf("comment_length: %u\n", e_cd->comment_length);	
			printf("\n"); 	
		}
	}
}


// Returns 1 (TRUE) if the first magic_len bytes of buffer match magic, and
// 0 (FALSE) otherwise. A '?' in magic is a one-character wildcard that
// matches any byte. A magic_len of 0 always matches.
//
//   buffer     bytes to test; at least magic_len bytes must be readable
//   magic      signature pattern; at least magic_len bytes must be readable
//   magic_len  number of bytes to compare
int magic_here(char *buffer, const char *magic, int magic_len) {
	int i = 0;
	// The length test comes first so that neither array is ever read at
	// index magic_len, which may lie past the end of buffer.
	while((i < magic_len) && ((buffer[i] == magic[i]) || (magic[i] == '?'))) {
		i++;
	}
	return i == magic_len;
}


void parse(char *buf, unsigned long buf_length, unsigned long **sig_indexes) {
	
	// for each of the NUMSIGS arrays of found signatures, the next free
	// array index
	int *next_free = calloc(NUMSIGS, sizeof(int));
	
	char *sigs[NUMSIGS];
	sigs[LFH] =		"\x50\x4b\x03\x04";
	sigs[CDFH] =	"\x50\x4b\x01\x02"; 
	sigs[EOCD] =  "\x50\x4b\x05\x06";
	
	unsigned long i;
	// for each byte
	for(i =0; i  < buf_length; i++) {

		// if a sig is here
		if(magic_here(buf + i, sigs[LFH], strlen(sigs[LFH]))) {

			struct local_file_hdr *local_fh = calloc(1, sizeof(struct local_file_hdr));
        	memcpy(local_fh, &buf[i + strlen(sigs[LFH])], sizeof(struct local_file_hdr)); 
				
			// We will eliminate the junk by looking at the size field; if
			// it is larger than the size of the image or 2GB then we 
			// will assume that the signature is junk. 
			unsigned long length = local_fh->comp_size;
			if((length < MAX_ZIP_SIZE) && (length < buf_length)) {
				// put index of the found sig in sig_indexes					
				sig_indexes[LFH][next_free[LFH]] = i;
				next_free[LFH] += 1;					
			}  
		}
				
		if(magic_here(buf + i, sigs[EOCD], strlen(sigs[EOCD]))) {

        	struct end_of_cd *e_cd = calloc(1, sizeof(struct end_of_cd));
        	memcpy(e_cd, &buf[i + strlen(sigs[EOCD])], sizeof(struct end_of_cd)); 
				
			unsigned long length = e_cd->cd_size;
			if((length < MAX_ZIP_SIZE) && (length < buf_length)) {
				// put index of the found sig in sig_indexes
				sig_indexes[EOCD][next_free[EOCD]] = i;
				next_free[EOCD] += 1;		
			}
		}
			
		if(magic_here(buf + i, sigs[CDFH], strlen(sigs[CDFH]))) {

        	struct cd_file_hdr *cd_fh = calloc(1, sizeof(struct cd_file_hdr));
       		memcpy(cd_fh, &buf[i + strlen(sigs[CDFH])], sizeof(struct cd_file_hdr)); 
						
			unsigned long length = cd_fh->comp_size;
			if((length < MAX_ZIP_SIZE) && (length < buf_length)) {
				// put index of the found sig in sig_indexes
				sig_indexes[CDFH][next_free[CDFH]] = i;
				next_free[CDFH] += 1;			
			}
		}
	}
	parse_zip_files(buf, buf_length, sig_indexes, next_free);
}



int main(int argc, char *argv[]) {
	
	if(argc != 2) {
		printf("Usage: ./zipparse <zipfile_candidate>\n");
		exit(1);
	}
	
	// make sure we can open file
	char *filename = argv[1]; 
	FILE *image;
	if((image = fopen(filename, "rb")) == NULL) {
		printf("Couldn't open input file: %s for reading.\n", filename);
		exit(1);
	}
	
	// get file size	
	struct stat stbuf;
	if(stat(filename, &stbuf) != 0) {
		printf("Couldn't stat input file: %s.\n", filename);
		exit(1);
	}
	unsigned long f_size = (unsigned long)stbuf.st_size;

	// get the bytes
	char *bytes = malloc(sizeof(char) * f_size);
	fread(bytes, 1, f_size, image);
	
	// allocate memory for signature indexes; we will statically
	// allocate enough room to find 500 of each -- this is a first try
	int num_sigs = 12;
	int max_indexes_per_sig = 500;
	unsigned long **sig_indexes = calloc(num_sigs, sizeof(unsigned long *));
	int i;
	for(i = 0; i < num_sigs; i++) {
		sig_indexes[i] = calloc(max_indexes_per_sig, sizeof(unsigned long));
	}
	
	// pass to verify
    parse(bytes, f_size, sig_indexes);

	return 0;    
}
