// Get the hash of sectors (for our file matching problem)
// General idea: read an n-byte chunk from the input file into an sstream. Then
// get the MD5 of that through md5.cc's istream interface.
// Probably going to need a word-based n-gram count for finding "novel phrases"
// to search for, too, but one thing at a time.
// "Novel phrases": have an English dictionary. Each word has a score depending
// on its scarcity (inverse probability). Multiple rare words have their
// probabilities multiplied. Just use an ngram model trained on text.
// (Didn't do that.)

#include <set>
#include <list>
#include <vector>
#include <sstream>
#include <fstream>
#include <iostream>
#include <iterator>

#include <sys/types.h>
#include <dirent.h>

using namespace std;

#include "src_md5/md5.cc"

string md5(istream & stream) {
	string to_ret;

	char * digest_ascii;
	MD5 check_util(stream);
	digest_ascii = check_util.hex_digest();

	for (int counter = 0; counter < 32; ++counter)
		to_ret.push_back(digest_ascii[counter]);
	delete[] digest_ascii; // allocated in md5.cc
	return(to_ret);
}

string md5_whole(string fn) {
	string md5res = "";

	ifstream file(fn.c_str());
	if (!file) return(md5res);

	return(md5(file));
}

class sector_signature {
	public:
		string hash;
		int sector_number;
		int bytes_count;

		sector_signature(string hash_in, int secnum_in, int bcount_in);
		sector_signature();
		bool operator<(const sector_signature & other) const;
		bool operator>(const sector_signature & other) const;
};

sector_signature::sector_signature(string hash_in, int secnum_in, 
		int bcount_in) {
	sector_number = secnum_in;
	bytes_count = bcount_in;
	hash = hash_in;
}

sector_signature::sector_signature() {
	sector_number = -1;
	bytes_count = -1;
	hash = "";
}

// TODO: if equal, then differentiate on byte count. Very unlikely, but we
// might as well be protected.

bool sector_signature::operator<(const sector_signature & other) const {
	if (hash == other.hash)
		return (bytes_count < other.bytes_count);

	return(hash < other.hash);
}

bool sector_signature::operator>(const sector_signature & other) const {
	if (hash == other.hash)
		return (bytes_count > other.bytes_count);

	return(hash > other.hash);
}

multiset<sector_signature> get_signatures(ifstream & file, int sector_size,
		int hash_sector_size, sector_signature & last_signature) {
	multiset<sector_signature> to_return;

	istreambuf_iterator<char> in(file);

	int sector_count = 0;

	while (in != istreambuf_iterator<char>()) {
		if ((sector_count & 1023) == 1023)
			cout << "\t\t\t" << sector_count << "    \r" << flush;
		stringstream cur_sector;
		int byte_counter = 0, hashed_bytes_counter = 0;
		while (in != istreambuf_iterator<char>() &&
				byte_counter < sector_size) {
			if (byte_counter < hash_sector_size) {
				cur_sector << *in;
				hashed_bytes_counter++;
			}
			byte_counter++;
			in++;
		}

		MD5 check_util(cur_sector);
		char * digest_ascii = check_util.hex_digest();
		string digest_ascii_str;
		copy(digest_ascii, digest_ascii + 32, 
				inserter(digest_ascii_str, 
					digest_ascii_str.end()));
		delete[] digest_ascii; // allocated in md5.cc

		sector_signature signature_of_this(digest_ascii_str, 
				sector_count, hashed_bytes_counter);
		to_return.insert(signature_of_this);

		if (in == istreambuf_iterator<char>())
			last_signature = signature_of_this;

		sector_count++;
	}

	return(to_return);
}

// Must be multiset because we're going to deplete the sectors we've already
// used as we go through the successful matches. If it was just a set, then
// if two sectors had the same hash (say for a run of all zeroes), only the
// first file would go okay.

multiset<sector_signature> get_haystack_hashes(string haystack_name, int sector_size) {

	ifstream haystack_input(haystack_name.c_str());
	
	if (!haystack_input) {
		cout << "Couldn't open file " << haystack_name	<< 
			" for reading!" << endl;
		exit(-1);
	}

	cout << endl << "Reading haystack..\r" << flush;
	sector_signature haystack_last;
	multiset<sector_signature> haystack_hashes = get_signatures(
			 haystack_input, sector_size, sector_size, 
			 haystack_last);
	haystack_input.close();

	return(haystack_hashes);
}

list<int> get_occupied_sectors(string haystack_name, 
		multiset<sector_signature> & haystack_hashes, string
		needle_name, int sector_size) {
	// Returns the empty list if there was no match, otherwise removes
	// and returns the sectors in the haystack.

	list<int> nothing, sectors_used;

	cout << "Needle name: " << needle_name << endl;
	cout << "Haystack name: " << haystack_name << endl;

	ifstream needle_input(needle_name.c_str());
	if (!needle_input) {
		cout << "Couldn't open file " << needle_name << " for reading!" << endl;
		return(nothing);
	}

	cout << "Reading needle..\r" << flush;

	// Most likely, one of these signatures don't encompass 512 bytes;
	// that's a corner case we must handle.
	sector_signature needle_last;
	multiset<sector_signature> needle_hashes = get_signatures(needle_input,
			sector_size, sector_size, needle_last);
	needle_input.close();

	/*ifstream haystack_input(argv[2]);
	if (!haystack_input) {
		cout << "Couldn't open file " << argv[2] << " for reading!" <<
			endl;
		return(-1);
	}

	cout << endl << "Reading haystack..\r" << flush;
	sector_signature haystack_last;
	set<sector_signature> haystack_hashes = get_signatures(haystack_input,
			sector_size, sector_size, haystack_last);
	haystack_input.close();*/

	cout << endl;
	// Now check how many members are in either and in its intersection.
	cout << "Haystack hashes: " << haystack_hashes.size() << endl;
	cout << "Needle hashes: " << needle_hashes.size() << endl;
	
	multiset<sector_signature> in_both;

	// Subtlety alert: Set intersection copies from the first when being
	// stable, which means haystack_hashes should be first -- otherwise, it
	// just copies the sector numbers from the needle, which does ut no
	// good whatsoever.
	set_intersection(haystack_hashes.begin(), haystack_hashes.end(),
			needle_hashes.begin(), needle_hashes.end(),
			inserter(in_both, in_both.end()));
	cout << "In both: " << in_both.size() << endl;

	list<sector_signature> sectors_containing_needle;
	bool found = false;

	if (in_both.size() == needle_hashes.size() && in_both.size() != 0) {
		cout << "In both contain all hashes that are in the needle."
			<< endl;
		copy(in_both.begin(), in_both.end(), inserter(
					sectors_containing_needle, 
					sectors_containing_needle.end()));
		found = true;
	}

	// The needle may still be contained in the haystack; the needle
	// may not have a last sector of equal size to the haystack's, so
	// check that. This happens only if there's just one sector missing,
	// and that sector is the needle's last.
	
	if (in_both.size() == needle_hashes.size()-1 && needle_last.bytes_count
			!= sector_size && !found) {
		cout << "Checking if the last needle sector exists in the " <<
			"haystack (" << needle_last.bytes_count << " bytes)" <<
			endl;
		ifstream haystack_in_redux(haystack_name.c_str());
		// We care not about the race condition. The optimizer would
		// use a hash tree, but we aren't either.
		sector_signature throwaway;
		multiset<sector_signature> haystack_redux = get_signatures(
				haystack_in_redux, sector_size, needle_last.
				bytes_count, throwaway);

		if (haystack_redux.find(needle_last) != haystack_redux.end()) {
			cout << "The needle is contained in the haystack." <<
				endl;

			copy(in_both.begin(), in_both.end(), inserter(
						sectors_containing_needle,
						sectors_containing_needle.
						end()));

			sectors_containing_needle.push_back(*(haystack_redux.
						find(needle_last)));

			found = true;
		}
	}

	if (found) {

		// Okay, we have a confirmed hit. Now remove the sectors so
		// they won't cause false positives later on.
		// The ordinary sector size sectors are easy, it's the last one
		// that's difficult. We do it by looking up the sector number
		// and then removing that manually.

		for (list<sector_signature>::const_iterator pos = 
				sectors_containing_needle.begin(); pos !=
				sectors_containing_needle.end(); ++pos) {
			if (pos->bytes_count == sector_size)
				haystack_hashes.erase(haystack_hashes.find(
							*pos));
			else {
				bool erased = false;
				multiset<sector_signature>::iterator to_erase;
				for (multiset<sector_signature>::iterator
						search = haystack_hashes.
						begin(); search !=
						haystack_hashes.end() &&
						!erased; search++)
					if (search->sector_number == pos->
							sector_number) {
						erased = true;
						to_erase = search;
					}

				if (erased)
					haystack_hashes.erase(to_erase);
			}

			sectors_used.push_back(pos->sector_number);
		}

		// Since the sets don't preserve relative ordering, sort the
		// sectors so that it is obvious that the sectors are in no
		// particular order wrt the order they appear in the needle.
		// (We could order them the right way by, for each sector
		//  signature, looking up the position in needle and then in
		//  haystack, but the specs don't require us to do so here..)

		sectors_used.sort();

	/*	copy(sectors_used.begin(), sectors_used.end(), 
				ostream_iterator<int>(cout, "\t"));*/
		return(sectors_used);
	}

	return(nothing);
}

void print_sector_regions(list<int> & sectors) {

	// Say we start at position p. If p+1 is the end or is more than a
	// sector away from us, print p and go to p+1. Else just go to p+1.

	int beginning = *sectors.begin();
	int last = beginning;

	for (list<int>::const_iterator curpos = ++sectors.begin(); curpos !=
			sectors.end(); ++curpos) {
		if (*curpos == last+1)
			last = *curpos;
		else {
			if (last != beginning)
				cout << beginning << "-" << last;
			else	cout << beginning;

			beginning = *curpos;
			last = *curpos;
			cout << ", ";
		}
	}

	if (last != beginning)
		cout << beginning << "-" << last << endl;
	else	cout << beginning << endl;
}

vector<string> get_files_in_dir(DIR * directory) {

	struct dirent * this_entry;
	vector<string> files;

	do {
		this_entry = readdir(directory);
		if (this_entry != NULL) {
			string to_push = this_entry->d_name;
			// Don't include backreferences
			// TODO: Don't include directories either
			if (to_push != "." && to_push != "..")
				files.push_back(this_entry->d_name);
		}
	} while (this_entry != NULL);

	return(files);
}

vector<string> get_files_in_dir(string directory) {
	DIR * direclink = opendir(directory.c_str());

	if (direclink == NULL) {
		cout << "Couldn't open directory " << directory << endl;
		return(vector<string>(0));
	}
	
	vector<string> toRet = get_files_in_dir(direclink);
	closedir(direclink);

	return(toRet);
}

main(int argc, char * * argv) {

	if (argc < 3) {
		cout << "Usage: " << argv[0] << " [haystack file] [needle directory]"
			<< endl;
		return(-1);
	}

	string needle_dir = argv[2], haystack_name = argv[1];
	vector<string> needles = get_files_in_dir(needle_dir);
	vector<list<int> > occupied_sectors(needles.size());
	int sector_size = 512;

	multiset<sector_signature> haystack_hashes = get_haystack_hashes(
			haystack_name, sector_size);

	int counter;

	for (counter = 0; counter < needles.size(); counter++) {
		string this_needle_name = needle_dir + "/" + needles[counter];
		occupied_sectors[counter] = get_occupied_sectors(haystack_name,
			haystack_hashes, this_needle_name, sector_size);
	}

	cout << endl << endl;

	for (counter = 0; counter < needles.size(); ++counter) {
		cout << needle_dir + "/" + needles[counter] << "\t";
		cout << occupied_sectors[counter].size() << " sectors." << endl;
	}

	for (counter = 0; counter < needles.size(); ++counter) {
		if (occupied_sectors[counter].empty()) continue;

		string path = needle_dir + "/" + needles[counter];

		cout << path << "\t";
		cout << md5_whole(path) << "\t";
		print_sector_regions(occupied_sectors[counter]);
	}

	/*occupied_sectors = get_occupied_sectors(haystack_name, haystack_hashes,
			needle_name, sector_size);

	cout << "Should be 0: " << occupied_sectors.size() << endl;*/
}

