﻿//==============================================================================================
//
//	LPCodeRecognizer
//	V2.2
//	by Luc Pattyn, January 2010
//
//	trying to recognize the main programming language for a code snippet
//
//	described in my CodeProject article:
//	http://www.codeproject.com/KB/applications/LPCodeRecognizer.aspx
//
//==============================================================================================
//	History
//		V1.0 = original
//		V2.0 = much improved, language groups, better criteria
//		V2.1 = zero line test added
//==============================================================================================
//
// License
//		The author hereby grants you a worldwide, non-exclusive license to use and redistribute 
//		the files and the source code in the article in any way you see fit, provided you keep
//		this notice in place; when code modifications are applied, the notice must reflect that.
//		The author retains copyright to his accompanying article, if any; you may not republish or
//		otherwise make available the article, in whole or in part, without the prior written consent
//		of the author.
// Disclaimer
//		This work is provided “as is”, without any express or implied warranties or conditions 
//		or guarantees. You, the user, assume all risk in its use. In no event will the author 
//		be liable to you on any legal theory for any special, incidental, consequential, punitive 
//		or exemplary damages arising out of this license or the use of the work or otherwise.
//
//==============================================================================================

using System;
using System.Collections.Generic;

namespace LPCodeRecognizer {

	/// <summary>
	/// Guesses the main programming language of a code snippet by tokenizing the text,
	/// counting characteristic punctuation per line, and scoring every candidate
	/// <c>Language</c> on keywords, checks, hotspots, coldspots and line-start words.
	/// </summary>
	/// <remarks>
	/// Modification notice (the license in the file header asks for one): relative to the
	/// original V2.2 code this class matches keywords culture-invariantly, tolerates a null
	/// text, a language without checks, and null or repeated entries in the language list,
	/// and builds the word-list log messages only when a logger is attached.
	/// </remarks>
	public class LPCodeRecognizer {

		private const string VERSION="V2.2";

		// characters that terminate a line
		private static readonly char[] LineBreaks={ '\r', '\n' };

		/// <summary>Optional sink for diagnostic messages; no logging happens while it is null.</summary>
		public static Action<string> Logger;

		/// <summary>Version string of the recognizer.</summary>
		public static string Version { get { return VERSION; } }

		/// <summary>Forwards a message to <see cref="Logger"/> when one is attached.</summary>
		/// <param name="s">The message to log.</param>
		public static void log(string s) {
			// Logger is a public mutable static: read it once so the null test and the call
			// use the same delegate
			Action<string> logger=Logger;
			if (logger!=null) logger(s);
		}

		/// <summary>
		/// Tracing hook used by the tokenizer. Intentionally a no-op; enable the line below
		/// to route the trace messages to <see cref="log"/>.
		/// </summary>
		/// <param name="s">The trace message.</param>
		public static void debug(string s) {
			// log(s);
		}

		/// <summary>
		/// Tries to recognize the main programming language of a code snippet.
		/// </summary>
		/// <param name="text">The snippet; a null text, or a text without any line break, is not analyzed.</param>
		/// <param name="languages">The candidate languages; null and repeated entries are skipped.</param>
		/// <param name="trust">
		/// Receives 0 when the text is not analyzed (null, no line break, or no non-empty line),
		/// 100 when no candidate scores high enough, and otherwise
		/// (int)(99*sqrt(1 - secondBestScore/bestScore)).
		/// </param>
		/// <returns>The best scoring language, or null when none is recognized.</returns>
		public static Language Recognize(string text, List<Language> languages, out int trust) {
			// tokenize all text from start of line to first special character
			// normal characters are: letters, digits, underscore, dollar (PHP), minus (CSS)
			// specials include all comment starters, quotation marks, equal sign, parenthesis, etc
			// every identifier, but no more than 3, and excluding the last one, are expected to be
			// keywords (e.g. public override void KeyPress(...) has 3 keyword candidates)
			// also count lines, and count some special chars separately (< > { } ; $)
			//
			// tokenizing is handled in a simple way, one character at a time
			//
			// letters include _ for most languages, # for preprocessors,
			//	$ for PHP, - for CSS, < for XML, @ for SQL, . for MSIL
			// digits are accepted to better match identifiers (although typically not part of keywords)
			string letters="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$_-#<@.";
			// all n* counters below count LINES holding the feature, not occurrences;
			// the matching b* flag records whether the current line holds it
			int nLines=0;				// lines holding at least one character
			int nRealLines=0;			// lines holding more than one non-blank character (before skipping starts)
			int nAngles=0;				// lines holding both '<' and '>' (the '>' of "->" excluded)
			int nCurlies=0;
			int nSemiColons=0;
			int nSemiColonCommas=0;		// lines holding a semicolon or a comma
			int nDollars=0;
			int nEquals=0;
			int nPeriods=0;
			int nCommas=0;
			int nColons=0;
			int nHexColons=0;			// lines holding a colon preceded by a hex digit
			int nParens=0;
			int nWords=0;
			bool bAngleLeft=false;
			bool bAngleRight=false;
			bool bCurlies=false;
			bool bSemiColons=false;
			bool bDollars=false;
			bool bEquals=false;
			bool bPeriods=false;
			bool bCommas=false;
			bool bColons=false;
			bool bHexColons=false;
			bool bParens=false;
			string word="";
			int nCharsInLine=0;
			int nRealCharsInLine=0;
			bool skipWordsInRestOfLine=false;
			bool endOfWord=false;
			string firstWord=null;
			string secondWord=null;
			int nWordsInLine=0;
			List<string> words=new List<string>();
			List<string> firstWords=new List<string>();
			List<string> secondWords=new List<string>();
			trust=0;
			// a null text is handled like a text without any line break: nothing to recognize
			if (text==null || text.IndexOfAny(LineBreaks)<0) return null;
			text+="\n";	// append sentinel
			char prevChar=' ';
			foreach (char c in text) {
				int iLetter=-1;
				// check for end-of-line
				if (c=='\r' || c=='\n') {
					// the last word of a line is not a keyword candidate, so the first word
					// only counts when more words follow it, and likewise for the second word
					if (nWordsInLine>1 && firstWord!=null) {
						debug("firstWord="+firstWord);
						firstWords.Add(firstWord);
					}
					if (nWordsInLine>2 && secondWord!=null) {
						debug("secondWord="+secondWord);
						secondWords.Add(secondWord);
					}
					nWordsInLine=0;
					firstWord=null;
					secondWord=null;
					if (nCharsInLine!=0) nLines++;
					if (nRealCharsInLine>1) nRealLines++;
					nCharsInLine=0;
					nRealCharsInLine=0;
					skipWordsInRestOfLine=false;
					endOfWord=true;
					// turn the per-line flags into line counts, and reset them for the next line
					if (bAngleLeft && bAngleRight) nAngles++;
					bAngleLeft=false;
					bAngleRight=false;
					if (bSemiColons) nSemiColons++;
					if (bSemiColons || bCommas) nSemiColonCommas++;
					bSemiColons=false;
					if (bDollars) nDollars++;
					bDollars=false;
					if (bCurlies) nCurlies++;
					bCurlies=false;
					if (bEquals) nEquals++;
					bEquals=false;
					if (bPeriods) nPeriods++;
					bPeriods=false;
					if (bCommas) nCommas++;
					bCommas=false;
					if (bColons) nColons++;
					bColons=false;
					if (bHexColons) nHexColons++;
					bHexColons=false;
					if (bParens) nParens++;
					bParens=false;
					// look for identifiers/keywords
				} else if (!skipWordsInRestOfLine) {
					nCharsInLine++;
					if (c!=' ' && c!='\t') nRealCharsInLine++;
					iLetter=letters.IndexOf(c);
					if (iLetter>=0) {
						word+=c;
					} else {
						endOfWord=true;
					}
				}
				if (endOfWord) {
					// single characters and words starting or ending with a minus are not stored
					if (word.Length>1 && !skipWordsInRestOfLine && !word.StartsWith("-") && !word.EndsWith("-")) {
						nWordsInLine++;
						if (firstWord==null) firstWord=word;
						else if (secondWord==null) secondWord=word;
						words.Add(word);
						nWords++;
						debug("found: "+word);
					}
					// ordinal comparison: the words hold ASCII characters only, and a culture-sensitive
					// comparison would miss "IF" and "DIM" under cultures with a dotless i (e.g. Turkish)
					if (string.Equals(word, "dim", StringComparison.OrdinalIgnoreCase)) skipWordsInRestOfLine=true;	// VB Dim statement
					if (string.Equals(word, "if", StringComparison.OrdinalIgnoreCase)) skipWordsInRestOfLine=true;	// VB If statement
					word="";
					endOfWord=false;
				}
				// count special characters (this happens for every character, skipping or not)
				switch (c) {
					case ' ':
					case '\t':
					case '\r':
					case '\n':
						break;
					case '<':
						bAngleLeft=true;
						break;
					case '>':
						// ignore "->" for C, PHP, ...
						if (prevChar!='-') bAngleRight=true;
						break;
					case '{':
					case '}':
						bCurlies=true;
						break;
					case '(':
					case ')':
						bParens=true;
						goto default;
					case ';':
						bSemiColons=true;
						break;
					case '$':
						bDollars=true;
						break;
					case '=':
						bEquals=true;
						goto default;	// don't store word as this is a lhs identifier
					case '.':
						bPeriods=true;
						goto default;
					case ',':
						bCommas=true;
						goto default;
					case ':':
						bColons=true;
						if ("0123456789abcdefABCDEF".IndexOf(prevChar)>=0) bHexColons=true;
						goto default;
					default:	// operators, parentheses, start of comment, etc.
						// a character that cannot be part of a word ends word collection for this line
						if (iLetter<0 && !skipWordsInRestOfLine) {
							skipWordsInRestOfLine=true;
							debug("skip from "+c+" (0x"+((int)c).ToString("X4")+") to EOL");
						}
						break;
				}
				prevChar=c;
			}
			log("Found "+nLines+" lines, "+nRealLines+" real lines, "+nWords+" words, "+nAngles+" angles, "+ nCurlies+" curlies, "+
				nSemiColons+" semi-colons, "+nDollars+" dollar signs");
			if (nLines==0) return null;
			LogWords("words:", words);
			LogWords("firstWords:", firstWords);
			LogWords("secondWords:", secondWords);

			// count the words starting with one of the language-specific prefix characters
			int dollarStart=0;
			int atStart=0;
			int lessThanStart=0;
			foreach (string ww in words) {
				if (ww.StartsWith("$")) dollarStart++;
				if (ww.StartsWith("@")) atStart++;
				if (ww.StartsWith("<")) lessThanStart++;
			}

			int keywordFactor=5;
			int hotspotFactor=5;
			int coldspotFactor=-5;
			int checksFactor=5;
			Dictionary<Language, int> scores=new Dictionary<Language,int>();
			foreach (Language lang in languages) {
				// a null entry cannot be scored, and a repeated entry already has its score
				if (lang==null || scores.ContainsKey(lang)) continue;
				string comment=": ";
				// score1: one point for each stored word that is a keyword of this language
				int score1=0;
				foreach (string word1 in words) {
					string word2=NormalizeCase(word1, lang.CaseSensitive);
					if (lang.Keywords.Contains(word2)) {
						score1++;
						comment+=" "+word2;
					}
				}
				// score2: +1 for each check that passes, -2 for each check that fails
				int score2=0;
				foreach (string check in lang.Checks) {
					bool pass=false;
					switch (check) {
						case "none":
							pass=true;
							break;
						case "many;":
							pass=100*nSemiColons>20*nLines;
							break;
						case "many;,":
							pass=100*nSemiColonCommas>20*nLines;
							break;
						case "few;":
							pass=100*nSemiColons<50*nLines;
							break;
						case "many{}":
							pass=100*nCurlies>5*nLines;
							break;
						case "few{}":
							pass=100*nCurlies<20*nLines;
							break;
						case "few()":
							pass=100*nParens<10*nLines;
							break;
						case "few<>":
							pass=100*nAngles<10*nLines;
							break;
						case "many<>":
							pass=100*nAngles>10*nLines;
							break;
						case "few=":
							pass=100*nEquals<10*nLines;
							break;
						case "few$":
							pass=100*nDollars<5*nLines;
							break;
						case "many.":
							pass=100*nPeriods>40*nLines;		// >40% of lines hold period
							break;
						case "many,":
							pass=100*nCommas>15*nLines;			// >15% of lines hold comma
							break;
						case "few,":
							pass=100*nCommas<15*nLines;			// <15% of lines hold comma
							break;
						case "few:":
							pass=100*nColons<30*nLines;			// <30% of lines hold colon
							break;
						case "many:":
							pass=100*nColons>30*nLines;			// >30% of lines hold colon
							break;
						case "manyX:":
							pass=100*nHexColons>30*nLines;			// >30% of lines hold hexdigit+colon
							break;
						case "many$start":
							pass=100*dollarStart>20*words.Count;	// >20% of words start with $
							break;
						case "few$start":
							pass=100*dollarStart<5*words.Count;		// < 5% of words start with $
							break;
						case "many<start":
							pass=100*lessThanStart>20*words.Count;	// >20% of words start with <
							break;
						case "few<start":
							pass=100*lessThanStart<5*words.Count;		// < 5% of words start with <
							break;
						case "many@start":
							pass=100*atStart>20*words.Count;	// >20% of words start with @
							break;
						case "few@start":
							pass=100*atStart<5*words.Count;		// < 5% of words start with @
							break;
						case "manyKeywords":
							pass=100*score1>40*nLines;			// keyword hits exceed 40% of the line count
							break;
						default:
							// an unknown check is reported and then counts as a failed check
							log("ERROR: unknown check "+check);
							break;
					}
					if (pass) {
						score2++;
						comment+=" +"+check;
					} else {
						score2-=2;
						comment+=" -"+check;
					}
				}
				// score3: hotspots add to the score, coldspots subtract from it, per occurrence in the text
				int score3=0;
				StringComparison sc=lang.CaseSensitive?StringComparison.Ordinal:StringComparison.OrdinalIgnoreCase;
				foreach (string hotspot in lang.Hotspots) {
					int nHotspot=CountOccurrences(text, hotspot, sc);
					if (nHotspot!=0) {
						score3+=nHotspot*hotspotFactor;
						comment+=" "+nHotspot.ToString()+"*"+hotspot;
					}
				}
				foreach (string coldspot in lang.Coldspots) {
					int nColdspot=CountOccurrences(text, coldspot, sc);
					if (nColdspot!=0) {
						score3+=nColdspot*coldspotFactor;
						comment+=" "+nColdspot.ToString()+"/"+coldspot;
					}
				}
				// score4: penalty for each line-start word that is not a keyword; LhsRecognize
				// selects whether the first word (>0) and also the second word (>1) are examined
				int score4=0;
				if (lang.LhsRecognize>0) {
					foreach (string word1 in firstWords) {
						string word2=NormalizeCase(word1, lang.CaseSensitive);
						if (!lang.Keywords.Contains(word2)) {
							score4-=keywordFactor;
							comment+=" ^"+word2;
						}
					}
				}
				if (lang.LhsRecognize>1) {
					foreach (string word1 in secondWords) {
						string word2=NormalizeCase(word1, lang.CaseSensitive);
						if (!lang.Keywords.Contains(word2)) {
							score4-=keywordFactor;
							comment+=" ^^"+word2;
						}
					}
				}
				// as different languages have a different number of checks, we should
				// normalize to a maximum independent of #checks
				score1*=keywordFactor;
				if (!lang.CaseSensitive) score1/=2;	// reduce VB attractiveness
				// without checks score2 is still 0, and dividing by the check count would throw
				if (lang.Checks.Length>0) score2=score2*nLines*checksFactor/lang.Checks.Length;
				score4*=nLines;
				int score=score1+score2+score3+score4;
				// and store
				scores.Add(lang, score);
				log("> "+lang.Name+" score="+score+comment);
			}

			int max=0;
			int max2=0;
			Language language=null;
			Language language2=null;
			// find top 2 scores; only positive scores qualify as both maxima start at zero
			foreach (KeyValuePair<Language, int> entry in scores) {
				int score=entry.Value;
				if (score>max) {
					if (max>=max2) { max2=max; language2=language; }
					max=score;
					language=entry.Key;
				} else {
					if (score>max2) { max2=score; language2=entry.Key; }
				}
			}
			// we want more than one score point per line
			if (10*max<11*nRealLines+10) language=null;
			if (language==null) {
				// NOTE(review): trust is 100 for "unknown" but 0 on the early returns above;
				// kept as is because callers may rely on it - confirm the intent
				trust=100;
				log("conclusion: unknown");
			} else {
				log("max="+max+"  max2="+max2);
				// max is positive here (it passed the threshold test) and max2 never exceeds it
				trust=(int)(99*Math.Sqrt(1.0-(double)max2/max));
				string lang2="";
				if (language2!=null) lang2=", maybe "+language2.Name;
				log("conclusion: "+language.Name+" ("+trust+"% trust"+lang2+")");
			}
			return language;
		}

		// Returns the word as is for a case-sensitive language, and lower-cased otherwise.
		// The invariant culture is used so the result does not depend on the current culture.
		private static string NormalizeCase(string word, bool caseSensitive) {
			return caseSensitive ? word : word.ToLowerInvariant();
		}

		// Counts the occurrences (overlapping ones included) of pattern in text;
		// a null or empty pattern counts as zero occurrences.
		private static int CountOccurrences(string text, string pattern, StringComparison comparison) {
			if (string.IsNullOrEmpty(pattern)) return 0;
			int count=0;
			int last=-1;
			for (; ; ) {
				last=text.IndexOf(pattern, last+1, comparison);
				if (last<0) break;
				count++;
			}
			return count;
		}

		// Logs the label followed by all items, each preceded by a space.
		// Nothing is built when no logger is attached.
		private static void LogWords(string label, List<string> items) {
			if (Logger==null) return;
			string s=label;
			if (items.Count>0) s+=" "+string.Join(" ", items.ToArray());
			log(s);
		}
	}
}
