// Rippers for data-only chunks.
// Because the data-only chunks don't have any complex interdependencies (in
// the cases we encounter, at least), this is greatly simplified: coalescer and
// dumper all in one.
// [But note DEFLATE streams - we can only get the full stream by 'expanding
// outwards', which this doesn't handle.]

#ifndef __DATA_RIPPER
#define __DATA_RIPPER

#include "data_verifier.cc"
#include "mail_header_verifier.cc"
#include "tree.cc"
#include "chunk.cc"
#include <list>
#include <iterator>

using namespace std;

// Abstract interface for a "ripper" of data-only chunks: one object that can
// recognise the chunks it handles, merge related chunks into a single tree,
// and write such a tree out.
class data_ripper {
	protected:
		// Format names handed out by the identifier getters below.
		// Derived classes are expected to fill these in (mail_ripper
		// does so in its constructor).
		string short_format_name, long_format_name;
	public:

		// Rippers are used polymorphically, so destruction through a
		// data_ripper pointer must reach the derived destructor.
		virtual ~data_ripper() {}

		// Returns the short format name.
		string get_identifier() const { return(short_format_name); }
		// Returns the long format name.
		string get_long_identifier() const { return(long_format_name); }

		// Returns true if this ripper is able to dump the given tree.
		virtual bool can_dump_this(const tree<structured_chunk> & 
				to_check) const = 0;

		// Tries to combine the chunk at pos with related chunks from
		// the list delimited by [begin, end). The bool result reports
		// whether anything was coalesced (mail_ripper returns false
		// when pos is not something it handles, true otherwise).
		virtual bool coalesce (
				list<tree<structured_chunk> >::iterator begin,
				list<tree<structured_chunk> >::iterator pos,
				list<tree<structured_chunk> >::iterator end) 
			= 0;

		// Writes to_dump to the output iterator start_here and returns
		// the output iterator to continue writing from.
		virtual ostream_iterator<char> dump_tree(const
				tree<structured_chunk> & to_dump,
				ostream_iterator<char> start_here) = 0;
};

// Ripper for mail text: recognises mail header chunks, folds a header and
// the mail chunks following it into one tree, and dumps that tree.
class mail_ripper : public data_ripper {
	public:

		// Sets the short and long format names.
		mail_ripper();

		// True when to_check is a mail header chunk.
		virtual bool can_dump_this(
				const tree<structured_chunk> & to_check) const;

		// Folds the header at pos and the mail chunks after it into a
		// single tree stored back at pos.
		virtual bool coalesce(
				list<tree<structured_chunk> >::iterator begin,
				list<tree<structured_chunk> >::iterator pos,
				list<tree<structured_chunk> >::iterator end);

		// Writes the top chunk, then every subordinate in order.
		virtual ostream_iterator<char> dump_tree(
				const tree<structured_chunk> & to_dump,
				ostream_iterator<char> start_here);

	private:

		// Scans backwards from last_byte for the start of the last
		// line, giving up after threshhold steps or at first_byte.
		data_iterator rewind_line(data_iterator first_byte,
				data_iterator last_byte, int threshhold);
};

// Walks backwards from last_byte looking for the start of the final line in
// [first_byte, last_byte).
//
// Used for coalescing header cases where part of the header resides in one
// chunk, and part in the next.
//
// first_byte: lower bound of the scan; the scan never moves before it.
// last_byte:  position the scan starts from (it is stepped back once before
//             anything is examined).
// threshhold: maximum number of backward steps taken by the scan loop.
//
// Returns the iterator one past the position where the scan stopped, i.e.
// one past the 0x0a that was found, or one past the position reached when
// first_byte or threshhold ended the scan. For an empty range the input
// position is returned unchanged.
data_iterator mail_ripper::rewind_line(data_iterator first_byte, data_iterator
		last_byte, int threshhold) {

	// An empty range has nothing to rewind over. Without this guard the
	// unconditional step back below would move before first_byte and the
	// loop would then read outside the range.
	if (first_byte == last_byte)
		return(last_byte);

	int counter = 0;

	// Step back once before scanning.
	// NOTE(review): the original comment says the first byte read off is
	// ignored because it may be the return ending the last line. That is
	// only true if last_byte points AT the final byte; if it is one past
	// the final byte, the final byte itself is tested below - confirm
	// which convention callers are meant to use.
	last_byte--;

	// Stop at first_byte, after threshhold steps, or on a line feed.
	while (last_byte != first_byte && counter++ < threshhold && *last_byte
			!= 0x0a)
		last_byte--;

	// Move forward again so the result is just after the stopping point.
	// NOTE(review): when the scan stopped because it reached first_byte
	// this yields first_byte + 1, not first_byte - kept as in the
	// original; confirm that is intended.
	return(++last_byte);
}

// Fills in the format names reported by get_identifier() and
// get_long_identifier().
mail_ripper::mail_ripper() {
	long_format_name.assign("SMTP/RFC 822 mail text");
	short_format_name.assign("mail");
}

// Reports whether to_check can be dumped by this ripper: only trees whose
// top chunk is a mail header qualify. (A matching test for
// DATA_ONLY_MAIL_BODY existed here but is disabled.)
bool mail_ripper::can_dump_this(const tree<structured_chunk> & to_check) const{
	if (to_check.get_value().get_type() == DATA_ONLY_MAIL_HDR)
		return(true);

	return(false);
}

// Builds one tree out of a mail header chunk and the mail chunks that follow
// it in the list.
//
// begin: start of the chunk list (not used by this implementation).
// pos:   candidate header chunk; on success the element at pos is replaced
//        by the combined tree.
// end:   past-the-end position of the chunk list.
//
// Every chunk absorbed after the header is flagged redundant in the list and
// a copy of it is appended to the combined tree's subordinates.
//
// Returns false if pos is at end, is already redundant, or is not a mail
// header; true otherwise (even when no following chunk was absorbed).
bool mail_ripper::coalesce(list<tree<structured_chunk> >::iterator begin,
		list<tree<structured_chunk> >::iterator pos,
		list<tree<structured_chunk> >::iterator end) {

	// Nothing to look at: pos must not be dereferenced at end.
	if (pos == end) return(false);

	list<tree<structured_chunk> >::iterator origpos = pos;

	// If it doesn't start at a mail header, we want none of it!
	if (pos->is_redundant()) return(false);
	if (pos->get_value().get_type() != DATA_ONLY_MAIL_HDR) return(false);

	// Make a tree of the header and any bodies we can find after it.
	// (May cause a problem if we falsely classify too much as mail..
	//  fix later.)
	// TODO: If the previous chunk's last member ends in 0a 0a 0a or has 
	// some header prefix we recognize, then combine the two.
	// The easier way is to check the last bit of the data of our last
	// added member. If the last bytes are 0a 0a 0a, then we allow a new
	// HEADER to be added.
	tree<structured_chunk> replace_with(*pos++);
	bool allow_header = false;
	mailhdr_verifier check_straddling_headers;

	// Absorb following chunks while they are mail bodies, or mail headers
	// that the previous chunk allowed. The end test comes first so that
	// pos is never dereferenced at end, and it applies to both cases.
	while (pos != end &&
			(pos->get_value().get_type() == DATA_ONLY_MAIL_BODY || 
			 (pos->get_value().get_type() == DATA_ONLY_MAIL_HDR && 
			  allow_header))) {
		pos->set_redundancy(true);

		// check for mbox terminator
		// NOTE: Doesn't use advance.
		data_iterator thischunk_start = pos->get_value().get_data(
				"DONLY_data").start;
		data_iterator thischunk_end = thischunk_start + pos->
			get_value().get_data("DONLY_data").length;

		// NOTE(review): each test below steps thischunk_end back
		// before reading, and short-circuiting stops at the first
		// mismatch. In the else branch thischunk_end has therefore
		// moved back by one to three bytes and no longer marks the
		// end of the chunk. A chunk shorter than three bytes is also
		// stepped past its start. Left as in the original because
		// rewind_line's expectations about its last_byte argument are
		// unclear - confirm before changing.
		if (*(--thischunk_end) == 0x0a && *(--thischunk_end) == 0x0a &&
				*(--thischunk_end) == 0x0a) {
			// Chunk ends in three line feeds: a new header may
			// follow.
			allow_header = true;
		} else {
			// Otherwise allow a header only if the tail of this
			// chunk verifies as a (possibly straddling) header.
			// Per the original note, verify_data takes
			// (beginning, pos, end, max_length, ancestral_score).
			allow_header = (check_straddling_headers.verify_data(
						thischunk_start,
						rewind_line(thischunk_start,
							thischunk_end, 200),
						thischunk_end, 200, 1).
					outcome == VO_SUCCESS);
		}

		replace_with.subordinates.push_back(*pos++);

	}

	replace_with.set_as_combined();
	*origpos = replace_with;

	return(true);
}

// Writes a coalesced mail tree to an output iterator: the top chunk (the
// header) goes first, followed by each subordinate in list order.
//
// to_dump:    tree to write out.
// start_here: output iterator to begin writing at.
//
// Returns the output iterator as handed back by the last chunk dumped.
ostream_iterator<char> mail_ripper::dump_tree(const
		tree<structured_chunk> & to_dump, ostream_iterator<char> 
		start_here) {

	ostream_iterator<char> out(start_here);

	// Header first.
	out = to_dump.get_value().dump_to_stream(out);

	// Then the subordinates, front to back.
	list<tree<structured_chunk> >::const_iterator child =
		to_dump.subordinates.begin();
	while (child != to_dump.subordinates.end()) {
		out = child->get_value().dump_to_stream(out);
		++child;
	}

	return(out);
}

#endif
