// html.cs created with MonoDevelop
// User: shen139 at 10:56 18/06/2008
//

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

namespace OpenWebSpiderCS
{
    // HTML helpers built on top of the http class: META "robots" handling,
    // title extraction, tag/entity stripping and plain-text conversion.
    // NOTE(review): the HTML member read by getTitle() and checkMETA() is not
    // declared in this part of the file; presumably it is inherited from http.
    public class html : http
    {
        // Flag cleared when a META "robots" tag contains NOINDEX
        public bool META_ROBOTS_INDEX = true;

        // Flag cleared when a META "robots" tag contains NOFOLLOW
        public bool META_ROBOTS_FOLLOW = true;

        // Regex used to extract the URLs from a page (BASE/A href, FRAME/IFRAME src).
        // NOTE(review): the group names "base", "a", "frame", "url" and "text" were
        // reconstructed; confirm them against the Groups[...] lookups in GetURLs.
        Regex regexExtractURLs = new Regex(
            "(?<base><base[^>]*\\shref\\s*=\\s*(?:(?:[\\\"\\\'](?<url>[^\\\"\\\']*)[\\\"\\\'])|(?<url>[^\\s>]*))[^>]*[>])|" +
            "(?<a><a[^>]*\\shref\\s*=\\s*(?:(?:[\\\"\\\'](?<url>[^\\\"\\\']*)[\\\"\\\'])|(?<url>[^\\s>]*))[^>]*[>](?<text>(.|\\s)*?)</a>)|" +
            "(?<frame><i?frame[^>]*\\ssrc\\s*=\\s*(?:(?:[\\\"\\\'](?<url>[^\\\"\\\']*)[\\\"\\\'])|(?<url>[^\\s>]*))[^>]*[>])",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Regex used to extract IMG tags and their src attribute.
        // NOTE(review): group names "img" and "url" were reconstructed; confirm
        // them against the code that consumes this regex.
        Regex regexExtractImages = new Regex(
            "(?<img><img[^>]*\\ssrc\\s*=\\s*(?:(?:[\\\"\\\'](?<url>[^\\\"\\\']*)[\\\"\\\'])|(?<url>[^\\s>]*))[^>]*[>])",
            RegexOptions.IgnoreCase);

        // Regexes used to extract the alt / title attribute value of a tag.
        // NOTE(review): group name "text" was reconstructed by analogy with the
        // "content" regex in checkMETA; confirm against the consuming code.
        Regex regexExtractImagesALT = new Regex(
            "alt\\s*=\\s*(?:(?:[\\\"\\\'](?<text>[^\\\"\\\']*)[\\\"\\\'])|(?<text>[^\\s>]*))",
            RegexOptions.IgnoreCase);

        Regex regexExtractImagesTITLE = new Regex(
            "title\\s*=\\s*(?:(?:[\\\"\\\'](?<text>[^\\\"\\\']*)[\\\"\\\'])|(?<text>[^\\s>]*))",
            RegexOptions.IgnoreCase);

        // Regex used to extract META tags that carry a name attribute.
        // Group "meta" is the whole tag, group "name" is the value of name=...
        // (both are read by checkMETA).
        Regex regexExtractMETA = new Regex(
            "(?<meta><meta[^>]*\\sname\\s*=\\s*(?:(?:[\\\"\\\'](?<name>[^\\\"\\\']*)[\\\"\\\'])|(?<name>[^\\s>]*))[^>]*[>])",
            RegexOptions.IgnoreCase);

        // Removes every occurrence of a tag from an HTML page.
        //   HTML     : text to clean
        //   startTAG : text that opens the span to remove
        //   endTAG   : text that closes the span to remove
        // Each span from startTAG up to and including the next endTAG is removed.
        // Matching ignores case. A startTAG that has no endTAG after it is left
        // in place. Returns HTML unchanged when any argument is null or empty.
        public string removeTAG(string HTML, string startTAG, string endTAG)
        {
            if (string.IsNullOrEmpty(HTML) || string.IsNullOrEmpty(startTAG) || string.IsNullOrEmpty(endTAG))
            {
                return HTML;
            }

            while (true)
            {
                // Ordinal comparison: markup tokens are not linguistic text, and a
                // culture-sensitive comparison fails to match e.g. "<SCRIPT"
                // against "<script" under the Turkish culture.
                int startTAGPos = HTML.IndexOf(startTAG, 0, StringComparison.OrdinalIgnoreCase);
                if (startTAGPos < 0)
                {
                    break;
                }

                int endTAGPos = HTML.IndexOf(endTAG, startTAGPos + 1, StringComparison.OrdinalIgnoreCase);
                if (endTAGPos <= startTAGPos)
                {
                    // unterminated span: nothing more can be removed
                    break;
                }

                HTML = HTML.Remove(startTAGPos, endTAGPos - startTAGPos + endTAG.Length);

                // yield the remainder of the time slice between removals
                System.Threading.Thread.Sleep(0);
            }

            return HTML;
        }

        /* removeHtmlEntity
         * replaces every HTML entity (&name; &#123; &#x1F;) with a space
         */
        private string removeHtmlEntity(string s)
        {
            // .NET has no "{,n}" quantifier (it is matched literally), so the
            // bounds must be written out in full.
            return Regex.Replace(s, "&#?\\w{1,10};", " ");
        }

        // Returns the text of the first TITLE element of the page, without the
        // surrounding white space, or an empty string when there is none.
        public string getTitle()
        {
            if (string.IsNullOrEmpty(HTML))
            {
                return string.Empty;
            }

            // The capture is lazy so that the trailing \s* takes the white space
            // in front of the closing tag.
            Match mTitle = Regex.Match(
                HTML,
                "<title\\b[^>]*>\\s*(?<title>[^<]*?)\\s*</title>",
                RegexOptions.IgnoreCase);

            return mTitle.Success ? mTitle.Groups["title"].Value : string.Empty;
        }

        // Scans the META tags of the page and applies the "robots" directives
        // through handleMETA_ROBOTS. Other named META tags (for example
        // "description" and "keywords") are ignored.
        // Always returns an empty string.
        public string checkMETA()
        {
            if (string.IsNullOrEmpty(HTML))
            {
                return string.Empty;
            }

            Regex regexExtractContent = new Regex(
                "content\\s*=\\s*(?:(?:[\\\"\\\'](?<text>[^\\\"\\\']*)[\\\"\\\'])|(?<text>[^\\s>]*))",
                RegexOptions.IgnoreCase);

            foreach (Match mMatch in regexExtractMETA.Matches(HTML))
            {
                string metaName = mMatch.Groups["name"].Value;
                if (!metaName.Equals("robots", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // when the tag has more than one content=... the last one wins
                string metaContent = string.Empty;
                foreach (Match mMatchContent in regexExtractContent.Matches(mMatch.Groups["meta"].Value))
                {
                    metaContent = mMatchContent.Groups["text"].Value;
                }

                // handle NOINDEX and NOFOLLOW
                handleMETA_ROBOTS(metaContent);
            }

            return string.Empty;
        }

        // Applies the comma separated directives of a META "robots" tag:
        //   noindex  -> META_ROBOTS_INDEX  = false
        //   nofollow -> META_ROBOTS_FOLLOW = false
        //   none     -> both flags = false (same as "noindex, nofollow")
        // Directives are compared ignoring case and surrounding white space.
        // A null or empty value leaves both flags untouched.
        public void handleMETA_ROBOTS(string meta_content)
        {
            if (string.IsNullOrEmpty(meta_content))
            {
                return;
            }

            foreach (string rawDirective in meta_content.Split(','))
            {
                string directive = rawDirective.Trim();

                // Ordinal comparison instead of ToLower(): under the Turkish
                // culture "NOINDEX".ToLower() does not equal "noindex".
                if (directive.Equals("noindex", StringComparison.OrdinalIgnoreCase))
                {
                    META_ROBOTS_INDEX = false;
                }
                else if (directive.Equals("nofollow", StringComparison.OrdinalIgnoreCase))
                {
                    META_ROBOTS_FOLLOW = false;
                }
                else if (directive.Equals("none", StringComparison.OrdinalIgnoreCase))
                {
                    META_ROBOTS_INDEX = false;
                    META_ROBOTS_FOLLOW = false;
                }
            }
        }

        // Replaces every character that is not a word character or one of
        // ; & # @ . : / ? with a space, then collapses each run of white space
        // into a single space.
        public string removeUnWantedChars(string HTML)
        {
            // replace every unwanted character with a space
            HTML = Regex.Replace(HTML, "[^\\w;&#@\\.:/\\?]", " ");

            // replace consecutive white space with a single space
            return Regex.Replace(HTML, "\\s+", " ");
        }

        // Takes an HTML page and extracts its text: script blocks, style blocks
        // and comments are dropped, the remaining tags and the HTML entities
        // are replaced by spaces, unwanted characters are removed and white
        // space is collapsed. Returns an empty string for null or empty input.
        public string UnHTML(string HTML)
        {
            if (string.IsNullOrEmpty(HTML))
            {
                return string.Empty;
            }

            // NOTE(review): the three start/end pairs below were reconstructed;
            // confirm them (the comment pair in particular).
            HTML = removeTAG(HTML, "<script", "</script>");
            HTML = removeTAG(HTML, "<style", "</style>");
            HTML = removeTAG(HTML, "<!--", "-->");

            // replace every remaining tag with a space
            HTML = Regex.Replace(HTML, "<[^>]+?>", " ", RegexOptions.Multiline);

            // remove the HTML entities
            HTML = removeHtmlEntity(HTML);

            // remove the invalid characters and collapse consecutive white space
            HTML = removeUnWantedChars(HTML);

            return HTML.Trim();
        }

        /* GetURLs
         * extracts every URL and the ANCHOR TEXT (from the A tag) from the following tags:
         *  - BASE   ... HREF
         *  - A      ... HREF
         *  - FRAME  ... SRC
         *  - IFRAME ... SRC
         (?<base><base[^>]*\shref\s*=\s*(?:(?:[\"\'](?<url>[^\"\']*)[\"\'])|(?<url>[^\s>]*))[^>]*[>])|(?<a><a[^>]*\shref\s*=\s*(?:(?:[\"\'](?<url>[^\"\']*)[\"\'])|(?<url>[^\s>]*))[^>]*[>](?<text>(.|\s)*?)</a>)|(?<frame><i?frame[^>]*\ssrc\s*=\s*(?:(?:[\"\'](?<url>[^\"\']*)[\"\'])|(?<url>[^\s>]*))[^>]*[>])
         *
         * Test text: an1 a2 a3