﻿// LPTextFileDiff
// version V1.1 (August 2009)
// by Luc Pattyn, http://www.perceler.com/
//
// a small class to compare two rather similar text files and list their differences
// as line-edit instructions (remove line, insert line, copy line)
//

#define WITHOUT_INPUT_LOGGING		// choose WITH or WITHOUT (normally WITHOUT)
#define WITHOUT_DETAILS_LOGGED		// choose WITH or WITHOUT (normally WITHOUT)
#define WITHOUT_LOG_LCS_MATRIX		// choose WITH or WITHOUT (normally WITHOUT)
#define WITHOUT_LOG_WINDOWSIZES		// choose WITH or WITHOUT (normally WITHOUT)
#define WITHOUT_INTERMEDIATE_TIMING	// choose WITH or WITHOUT (normally WITHOUT)

using System;
using System.Collections.Generic;	// List<>
using System.Diagnostics;			// Stopwatch
using System.IO;					// File, StreamWriter
using System.Windows.Forms;			// Application

namespace LPMeanAndLean {
	/// <summary>
	/// Compares two rather similar text files and reports their differences as line-edit
	/// instructions (remove line, insert line, copy line). Both files are read through a
	/// sliding window of lines; inside the window a longest-common-subsequence (LCS) matrix
	/// decides which lines match.
	/// </summary>
	public class LPTextFileDiff {
		private string file1;
		private string file2;
		private int windowSizeOrig;
		private int threshold;
		private bool gcCollectFlag;
		private StreamWriter streamWriter;
		private bool logMatchingLines;

		// kind of the most recently reported line: 1=removal, 2=insertion, 3=copy (0=nothing yet);
		// an empty separator line is emitted whenever the kind changes
		private int lastReport;
		private int linesMatched;
		private int linesRemoved;
		private int linesInserted;

		/// <summary>Stores the settings for a comparison; no file is opened yet.</summary>
		/// <param name="file1">Path of the first ("old") file.</param>
		/// <param name="file2">Path of the second ("new") file.</param>
		/// <param name="windowSizeOrig">Initial sliding window size in lines; must be positive.
		/// The window may grow up to 256 times this value.</param>
		/// <param name="threshold">When a window holds fewer matching lines than this,
		/// the window size is doubled (until the maximum is reached) and the pass is retried.</param>
		/// <param name="gcCollectFlag">When true, GC.Collect() is called at several points.</param>
		/// <param name="streamWriter">Optional log stream; may be null (console output only).</param>
		/// <param name="logMatchingLines">When true, matching lines are listed as well.</param>
		/// <exception cref="ArgumentOutOfRangeException">windowSizeOrig is zero or negative.</exception>
		public LPTextFileDiff(string file1, string file2, int windowSizeOrig, int threshold, 
			bool gcCollectFlag, StreamWriter streamWriter, bool logMatchingLines) {
			// a non-positive window can never be filled: the comparison would either index
			// outside the LCS matrix or spin forever waiting for an end-of-file that is never seen
			if (windowSizeOrig<=0) throw new ArgumentOutOfRangeException("windowSizeOrig",
				"The sliding window size must be positive.");
			this.file1=file1;
			this.file2=file2;
			this.windowSizeOrig=windowSizeOrig;
			this.threshold=threshold;
			this.gcCollectFlag=gcCollectFlag;
			this.streamWriter=streamWriter;
			this.logMatchingLines=logMatchingLines;
		}

		/// <summary>
		/// Runs the comparison and emits the edit instructions through the output methods.
		/// </summary>
		/// <returns>A two-line statistics text, or an empty string when either file
		/// does not exist.</returns>
		public string Compare() {
			string report="";
			int minimumMatchingLines=int.MaxValue;
			// statistics and separator state are per comparison, so start afresh on every call
			lastReport=0;
			linesMatched=0;
			linesRemoved=0;
			linesInserted=0;
			Document.WindowSize=windowSizeOrig;		// awake Document class
			if (File.Exists(file1) && File.Exists(file2)) {
				int windowSize=windowSizeOrig;
				int windowSizeMax=windowSizeOrig*256;
#if WITH_INTERMEDIATE_TIMING
				Stopwatch sw3=new Stopwatch();
				sw3.Start();
#endif
				using (Document doc1=new Document(file1), doc2=new Document(file2)) {
					gcCollect();
					bool slidingWindowTooSmall=false;
					for (int loop=0;;loop++) {
#if WITH_INTERMEDIATE_TIMING
						output1("---------------------------------------------------");
						output2("loop="+loop+"  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec  "+
							doc1.LineNumber);
#endif
						// if one window is empty and the other is not, fill the empty one to the same level,
						// then try and remove matching lines.
						int NW1=doc1.Count;
						int NW2=doc2.Count;
						if (NW1==0 ^ NW2==0) {
							if (NW2!=0) {doc1.ReadMoreLines(NW2); NW1=doc1.Count;}
							if (NW1!=0) {doc2.ReadMoreLines(NW1); NW2=doc2.Count;}
							// NW1 and NW2 are deliberately not refreshed: they bound the number of
							// leading pairs to try; reportCopy() drops the head of both windows
							for (int i=1; i<=NW1 && i<=NW2; i++) {
								if (doc1[0]!=doc2[0]) break;
								reportCopy(doc1, doc2);
							}
						}
						// if both windows are empty, read directly from the streams until
						// a non-matching pair of lines is encountered.
						// (most of this code has been copied from the internals of the Document class
						// and the reportCopy() method to improve performance.)
						if (doc1.Count==0 && doc2.Count==0) {
							int lineNumber1=doc1.LineNumber;
							int lineNumber2=doc2.LineNumber;
							StreamReader sr1=doc1.StreamReader;
							StreamReader sr2=doc2.StreamReader;
							int matches=0;
							for (; ; ) {
								string s1=sr1.ReadLine();
								string s2=sr2.ReadLine();
								if (s1!=s2) {
									// first difference: buffer both lines (StoreLine ignores null,
									// i.e. a stream that just reached its end)
									doc1.StoreLine(s1);
									doc2.StoreLine(s2);
									break;
								}
								if (s1==null) break;	// both streams ended simultaneously
								if (logMatchingLines) {
									if (lastReport!=3) output1("");
									lastReport=3;
									output1(string.Format("{0:D5} {1:D5} : {2}", ++lineNumber1, 
										++lineNumber2, s1));
								}
								matches++;
								if ((matches & 0x1FF)==0) gcCollect();	// once every 512 matches
							}
							if (matches!=0) {
								lastReport=3;
								linesMatched+=matches;
								doc1.LineNumber+=matches;
								doc2.LineNumber+=matches;
							}
						}
#if WITH_INTERMEDIATE_TIMING
						output2("Bmore  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec  "+
							doc1.LineNumber);
#endif
						// now all identical lines have been removed from the start, start the LCS
						doc1.ReadMoreLines(0); // fill the window
						doc2.ReadMoreLines(0); // fill the window
#if WITH_INTERMEDIATE_TIMING
						output2("Emore  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec");
#endif
						// compare using LCS algorithm and matrix
						NW1=doc1.Count;
						NW2=doc2.Count;
						if (NW1+NW2<=0) break;
						details("NW="+NW1+" "+NW2);
						// size the matrices to what the windows actually hold; the extra bottom row
						// and right column act as a zero border, and new arrays are zero-filled
						int[,] LCS=new int[NW1+1, NW2+1];
						bool[,] match=new bool[NW1+1, NW2+1];
						// calculate LCS matrix backwards, starting at (NW1-1,NW2-1)
						for (int i1=NW1-1; i1>=0; i1--) {
							string line1=doc1[i1];
							for (int i2=NW2-1; i2>=0; i2--) {
								if (line1==doc2[i2]) {
									LCS[i1, i2]=LCS[i1+1, i2+1]+1;
									match[i1, i2]=true;
								} else {
									// largest neighbour (bottom or right) without Math.Max
									LCS[i1, i2]=LCS[i1, i2+1]>LCS[i1+1, i2]?LCS[i1, i2+1]:LCS[i1+1, i2];
								}
							}
						}
#if WITH_LOG_LCS_MATRIX
						showMatrix(NW1, NW2, LCS, match);
#endif
						// now "backtrack" forward and emit results
#if WITH_INTERMEDIATE_TIMING
						output2("LCS full  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec  "+
							doc1.LineNumber);
#endif
						int matchingLines=LCS[0, 0];	// element (0,0) equals number of matching lines
#if WITH_LOG_WINDOWSIZES
						output2("matchingLines="+matchingLines);
#endif
						if (matchingLines<threshold && windowSize<windowSizeMax) {
							windowSize*=2;
							Document.WindowSize=windowSize;
#if WITH_LOG_WINDOWSIZES
							output2("INFO: window size doubled to "+windowSize+" lines");
#endif
							continue;
						}
						// we won't grow the window any further
						if (matchingLines<minimumMatchingLines) minimumMatchingLines=matchingLines;
						if (matchingLines==0) {
							// not a single match, so output one line of each, and try again
							if (doc1.Count!=0) reportRemoval(doc1);
							if (doc2.Count!=0) reportInsertion(doc2);
							if (doc1.Eof || doc2.Eof) break;
							if (!slidingWindowTooSmall) {
								slidingWindowTooSmall=true;
								output2("ERROR: difference too large for sliding window");
							}
						} else {
							// invariant: matchingLines==LCS[i1, i2]; as it stays positive inside the
							// loop, neither index can reach the zero border
							for (int i1=0, i2=0; ; ) {
								// first test diagonal step, as this corresponds to a line match;
								// if no match try either an insert or a delete
								if (match[i1, i2]) {
									reportCopy(doc1, doc2);
									i1++;
									i2++;
									if (--matchingLines<=0) break;	// no more matches in current window
								} else if (matchingLines==LCS[i1, i2+1]) {
									reportInsertion(doc2);
									i2++;
								} else {
									reportRemoval(doc1);
									i1++;
								}
							}
						}
#if WITH_INTERMEDIATE_TIMING
						output2("LCS done  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec  "+
							doc1.LineNumber);
#endif
						LCS=null;
						match=null;
						gcCollect();
						//Thread.Sleep(0);
					}
#if WITH_INTERMEDIATE_TIMING
					output2("done  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec");
#endif
					// make sure all remaining text gets processed
					do { while (doc1.Count!=0) reportRemoval(doc1); } while (doc1.ReadMoreLines(0));
					do { while (doc2.Count!=0) reportInsertion(doc2); } while (doc2.ReadMoreLines(0));
#if WITH_INTERMEDIATE_TIMING
					output2("done  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec");
#endif
				}
				// use the platform line break (the former "\n\r" had CR and LF swapped)
				report=Environment.NewLine+
					string.Format("Statistics: {0} lines matched, {1} removed, {2} inserted",
						linesMatched, linesRemoved, linesInserted)+
					Environment.NewLine+
					string.Format("Window sizes: initial={0}, final={1}; threshold={2}, minimum matches={3}",
						windowSizeOrig, windowSize, threshold, minimumMatchingLines);
#if WITH_INTERMEDIATE_TIMING
				output2("done  "+sw3.Elapsed.TotalMilliseconds.ToString("N1")+" msec");
				sw3.Stop();
#endif
			}
			return report;
		}

		// force a garbage collection, but only when the caller asked for it
		private void gcCollect() {
			if (gcCollectFlag) GC.Collect();
		}

		// output to stream or console, but not to both
		private void output1(string s) {
			if (streamWriter!=null) streamWriter.WriteLine(s);
			else Console.WriteLine(s);
		}

		// output to both console and stream (iff open);
		// the console copy is cut to 75 characters plus "..." when longer than 78
		private void output2(string s) {
			if (streamWriter!=null) streamWriter.WriteLine(s);
			if (s.Length>78) s=s.Substring(0, 75)+"...";
			Console.WriteLine(s);
		}

		// diagnostic output, only active when WITH_DETAILS_LOGGED is defined
		private void details(string s) {
#if WITH_DETAILS_LOGGED
			output2(s);
#endif
		}

		// dump the matrices (for debugging only); a '=' marks a cell where the lines match
		private void showMatrix(int NW1, int NW2, int[,] LCS, bool[,] match) {
			for (int i1=0; i1<=NW1; i1++) {
				string s="";
				for (int i2=0; i2<=NW2; i2++) {
					s+=" "+(match[i1, i2]?"=":" ")+LCS[i1, i2].ToString("D1");
				}
				output1(s);
			}
		}

		// drop the first line of doc1's window and report it as removed
		private void reportRemoval(Document doc1) {
			int lineNumber;
			string text=doc1.DropLine(out lineNumber);
			if (lastReport!=1) output2("");
			lastReport=1;
			output2(string.Format("{0:D5}  ---  : {1}", lineNumber, text));
			linesRemoved++;
		}

		// drop the first line of doc2's window and report it as inserted
		private void reportInsertion(Document doc2) {
			int lineNumber;
			string text=doc2.DropLine(out lineNumber);
			if (lastReport!=2) output2("");
			lastReport=2;
			output2(string.Format(" +++  {0:D5} : {1}", lineNumber, text));
			linesInserted++;
		}

		// drop the first line of both windows and report it as unchanged
		// (only listed when logMatchingLines is set)
		private void reportCopy(Document doc1, Document doc2) {
			// warning: this code got duplicated in Compare(), at the beginning of the main for loop
			int lineNumber1;
			doc1.DropLine(out lineNumber1);		// same text as in doc2, only the line number is needed
			int lineNumber2;
			string text=doc2.DropLine(out lineNumber2);
			if (logMatchingLines) {
				if (lastReport!=3) output1("");
				output1(string.Format("{0:D5} {1:D5} : {2}", lineNumber1, lineNumber2, text));
			}
			lastReport=3;
			linesMatched++;
		}
	}

	/// <summary>
	/// A text file seen through a sliding window: lines are read from the file into a
	/// buffer on request and dropped from the front of that buffer once they have been handled.
	/// </summary>
	public class Document : IDisposable {
		// default window size (in lines), shared by all Document instances
		private static int windowSize=10;
		private static int counter=0;
		private int docNumber;		// sequence number, only shown by the optional input logging
		private StreamReader sr;
		private List<string> list;	// the buffered lines, oldest first
		private int lineNumber=0;	// number of lines handed out (or skipped by the caller) so far
		private bool eof;

		/// <summary>Sets the default window size, in lines, for all documents.</summary>
		/// <exception cref="ArgumentOutOfRangeException">The value is zero or negative.</exception>
		public static int WindowSize {
			set {
				// with a non-positive size ReadMoreLines(0) would never read a line, never
				// detect end-of-file and keep returning true
				if (value<=0) throw new ArgumentOutOfRangeException("value",
					"The window size must be positive.");
				windowSize=value;
			}
		}

		/// <summary>Opens the given text file; no line is read yet.</summary>
		/// <param name="filename">Path of the file to read.</param>
		public Document(string filename) {
			docNumber=++counter;
			list=new List<string>(windowSize);
			sr=File.OpenText(filename);
		}

		// some properties
		/// <summary>The underlying reader, for callers that read lines directly.</summary>
		public StreamReader StreamReader { get { return sr; } }
		/// <summary>Number of lines currently buffered.</summary>
		public int Count { get { return list.Count; } }
		/// <summary>True once a read attempt by ReadMoreLines hit the end of the file.</summary>
		public bool Eof { get { return eof; } }
		/// <summary>Number of the line most recently dropped; callers that consume lines
		/// directly from the reader must advance it themselves.</summary>
		public int LineNumber { get { return lineNumber; } set { lineNumber=value; } }

		/// <summary>
		/// To be called when a line got read using the public StreamReader and needs being
		/// buffered now. A null line (end of stream) is ignored.
		/// </summary>
		public void StoreLine(string s) {
			if (s!=null) list.Add(s);
		}

		/// <summary>Fills the sliding window as much as possible.</summary>
		/// <param name="windowSize">Number of lines the buffer should hold; zero or negative
		/// selects the shared default window size.</param>
		/// <returns>False when end-of-file had already been detected before this call
		/// (nothing is read then); true otherwise, even if this call itself hit the end.</returns>
		public bool ReadMoreLines(int windowSize) {
			if (eof) return false;
			if (windowSize<=0) windowSize=Document.windowSize;
			while (list.Count<windowSize) {
				string s=sr.ReadLine();
				if (s==null) {
					eof=true;
					break;
				}
#if WITH_INPUT_LOGGING
				Console.WriteLine("........ doc"+docNumber+": "+s); 
#endif
				list.Add(s);
			}
			return true;
		}

		/// <summary>Returns the buffered line at the given position (0 is the oldest).</summary>
		public string this[int index] { get { return list[index]; } }

		/// <summary>Removes the oldest buffered line and returns it.</summary>
		/// <param name="lineNumber">Receives the one-based number assigned to the dropped line.</param>
		/// <returns>The text of the dropped line.</returns>
		public string DropLine(out int lineNumber) {
			string text=list[0];
			list.RemoveAt(0);
			lineNumber=++this.lineNumber;
			return text;
		}

		/// <summary>Closes the underlying reader.</summary>
		public void Dispose() {
			if (sr!=null) sr.Close();
		}
	}
}
