// html.cs created with MonoDevelop
// User: shen139 at 10:56 18/06/2008
//
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
namespace OpenWebSpiderCS
{
public class html : http
{
// Flag tied to the META "robots" NOINDEX directive: starts out true and is set to
// false by handleMETA_ROBOTS when that directive is found.
public bool META_ROBOTS_INDEX = true;
// Flag tied to the META "robots" NOFOLLOW directive: starts out true and is set to
// false by handleMETA_ROBOTS when that directive is found.
public bool META_ROBOTS_FOLLOW = true;
// Regex used to extract URLs from pages. It is an alternation of three branches:
// two keyed on an href attribute and one keyed on a src attribute, each accepting
// either a quoted or an unquoted attribute value. Matching ignores case, and
// Singleline makes '.' match newlines as well.
// NOTE(review): sequences such as "(?]*" and "(?[^" are not valid grouping constructs;
// they look like named groups ("(?<name>...") whose angle-bracket part is missing.
// Confirm that this pattern compiles before relying on it.
Regex regexExtractURLs = new Regex(
"(?]*\\shref\\s*=\\s*(?:(?:[\\\"\\\'](?[^\\\"\\\']*)[\\\"\\\'])|(?[^\\s>]*))[^>]*[>])|"
+ "(?]*\\shref\\s*=\\s*(?:(?:[\\\"\\\'](?[^\\\"\\\']*)[\\\"\\\'])|(?[^\\s>]*))[^>]*[>](?(.|\\s)*?))|"
+ "(?]*\\ssrc\\s*=\\s*(?:(?:[\\\"\\\'](?[^\\\"\\\']*)[\\\"\\\'])|(?[^\\s>]*))[^>]*[>])"
, RegexOptions.IgnoreCase | RegexOptions.Singleline );
/*
(?
]*\ssrc\s*=\s*(?:(?:[\"\'](?[^\"\']*)[\"\'])|(?[^\s>]*))[^>]*[>])
*/
// Regex used for image extraction (going by its name): keyed on a src attribute whose
// value may be quoted or unquoted; case-insensitive. The block comment above holds the
// same pattern without the C# string escaping.
// NOTE(review): the literal is split across two lines and contains "(?" sequences that
// are not valid grouping constructs; it looks like the tag/group-name text is missing.
// Confirm against a known-good copy.
Regex regexExtractImages = new Regex("(?
]*\\ssrc\\s*=\\s*(?:(?:[\\\"\\\'](?[^\\\"\\\']*)[\\\"\\\'])|(?[^\\s>]*))[^>]*[>])", RegexOptions.IgnoreCase);
// Matches an alt attribute followed by its quoted or unquoted value; case-insensitive.
Regex regexExtractImagesALT = new Regex("alt\\s*=\\s*(?:(?:[\\\"\\\'](?[^\\\"\\\']*)[\\\"\\\'])|(?[^\\s>]*))", RegexOptions.IgnoreCase);
// Matches a title attribute followed by its quoted or unquoted value; case-insensitive.
Regex regexExtractImagesTITLE = new Regex("title\\s*=\\s*(?:(?:[\\\"\\\'](?[^\\\"\\\']*)[\\\"\\\'])|(?[^\\s>]*))", RegexOptions.IgnoreCase);
/*
(?]*\sname\s*=\s*(?:(?:[\"\'](?[^\"\']*)[\"\'])|(?[^\s>]*))[^>]*[>])
*/
// Matches a tag carrying a name attribute with a quoted or unquoted value; case-insensitive.
// checkMETA runs it over the page and reads the groups "meta" and "name" from each match.
// The block comment above holds the same pattern without the C# string escaping.
Regex regexExtractMETA = new Regex("(?]*\\sname\\s*=\\s*(?:(?:[\\\"\\\'](?[^\\\"\\\']*)[\\\"\\\'])|(?[^\\s>]*))[^>]*[>])", RegexOptions.IgnoreCase);
// Removes all occurrences of a tag from an HTML page.
// Strips every region of HTML that begins at startTAG and ends at the next endTAG
// (markers included), repeating until no complete start/end pair remains.
//   HTML     - text to clean; null or empty text is returned unchanged.
//   startTAG - opening marker, matched case-insensitively.
//   endTAG   - closing marker, matched case-insensitively.
// Returns the cleaned text. An opening marker with no closing marker after it is
// left in place. If either marker is null or empty the text is returned unchanged.
public string removeTAG(string HTML, string startTAG, string endTAG)
{
    // Nothing meaningful to strip; this also keeps IndexOf from being asked to start
    // searching past the end of the text when a marker is empty.
    if (string.IsNullOrEmpty(HTML) || string.IsNullOrEmpty(startTAG) || string.IsNullOrEmpty(endTAG))
    {
        return HTML;
    }

    while (true)
    {
        // Markup is not linguistic text: ordinal matching makes the result independent
        // of the current culture and guarantees that the matched span is exactly as
        // long as the marker, which the removal length below relies on.
        // The search restarts at 0 because a removal can join the surrounding text
        // into a new opening marker.
        int startTAGPos = HTML.IndexOf(startTAG, 0, StringComparison.OrdinalIgnoreCase);
        if (startTAGPos < 0)
        {
            break;
        }

        int endTAGPos = HTML.IndexOf(endTAG, startTAGPos + 1, StringComparison.OrdinalIgnoreCase);
        if (endTAGPos < 0)
        {
            break;
        }

        HTML = HTML.Remove(startTAGPos, endTAGPos - startTAGPos + endTAG.Length);

        // Sleep(0) gives up the rest of this thread's time slice to other ready threads
        // between removals.
        System.Threading.Thread.Sleep(0);
    }

    return HTML;
}
/* removeHtmlEntity
* Removes HTML entities (&....;)
*/
// Replaces each HTML entity in s with a single space.
// An entity is taken to be '&', then 1 to 10 characters that are not ';', '&' or
// whitespace, then ';' (for example "&amp;" or "&#160;").
//   s - text to process; null or empty text is returned unchanged.
// Returns the text with every such entity replaced by " ".
private string removeHtmlEntity(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return s;
    }

    // The bounds are written out in full: .NET does not treat "{,10}" as a quantifier
    // (it is matched as literal text), so a pattern using it never matches a real entity.
    return Regex.Replace(s, "&[^;&\\s]{1,10};", " ");
}
// Returns the text of the first <title> element found in the page text held in HTML,
// with surrounding whitespace trimmed. Returns an empty string when HTML is null or
// empty, or when no title element is found.
public string getTitle()
{
    if (string.IsNullOrEmpty(HTML))
    {
        return string.Empty;
    }

    // The named group "title" captures everything between the opening tag (with or
    // without attributes) and the closing tag; [^<] also spans line breaks.
    Match mTitle = Regex.Match(HTML, "<title(?:\\s[^>]*)?>(?<title>[^<]*)</title>", RegexOptions.IgnoreCase);
    if (!mTitle.Success)
    {
        return string.Empty;
    }

    return mTitle.Groups["title"].Value.Trim();
}
// Scans the page text held in HTML for META tags (via regexExtractMETA) and applies
// the directives of every tag named "robots" by passing its content attribute to
// handleMETA_ROBOTS. Other META names (e.g. "description", "keywords") are not acted upon.
// Always returns an empty string; the outcome is reported through the flags that
// handleMETA_ROBOTS updates.
public string checkMETA()
{
    if (string.IsNullOrEmpty(HTML))
    {
        return string.Empty;
    }

    // Captures the value of a content attribute, quoted or unquoted, in the group "text".
    Regex regexExtractContent = new Regex("content\\s*=\\s*(?:(?:[\\\"\\\'](?<text>[^\\\"\\\']*)[\\\"\\\'])|(?<text>[^\\s>]*))", RegexOptions.IgnoreCase);

    foreach (Match mMatch in regexExtractMETA.Matches(HTML))
    {
        // META names are identifiers, not linguistic text, so compare them ordinally.
        string metaName = mMatch.Groups["name"].Value;
        if (!metaName.Equals("robots", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // If the tag holds several content attributes the last one wins.
        string metaContent = string.Empty;
        foreach (Match mMatchContent in regexExtractContent.Matches(mMatch.Groups["meta"].Value))
        {
            metaContent = mMatchContent.Groups["text"].Value;
        }

        // Handles NOINDEX and NOFOLLOW.
        handleMETA_ROBOTS(metaContent);
    }

    return string.Empty;
}
// Applies the directives of a META "robots" content value to the crawl flags.
//   meta_content - comma-separated directive list (e.g. "noindex, nofollow");
//                  null or empty content leaves the flags untouched.
// "noindex" sets META_ROBOTS_INDEX to false, "nofollow" sets META_ROBOTS_FOLLOW to
// false, and "none" sets both to false. Directives are matched case-insensitively
// after trimming; the flags are only ever cleared here, never set back to true.
public void handleMETA_ROBOTS(string meta_content)
{
    if (string.IsNullOrEmpty(meta_content))
    {
        return;
    }

    foreach (string rawDirective in meta_content.Split(','))
    {
        string directive = rawDirective.Trim();

        // Ordinal comparison: lower-casing with the current culture would turn
        // "NOINDEX" into a different string under Turkish casing rules.
        bool isNone = directive.Equals("none", StringComparison.OrdinalIgnoreCase);

        if (isNone || directive.Equals("noindex", StringComparison.OrdinalIgnoreCase))
        {
            META_ROBOTS_INDEX = false;
        }

        if (isNone || directive.Equals("nofollow", StringComparison.OrdinalIgnoreCase))
        {
            META_ROBOTS_FOLLOW = false;
        }
    }
}
// Cleans a piece of text in two passes:
//   1. every character that is not a word character or one of ; @ . : / ? becomes a space;
//   2. every run of whitespace is collapsed into a single space.
// Returns the cleaned text.
public string removeUnWantedChars(string HTML)
{
    Regex unwantedChar = new Regex("[^\\w;@\\.:/\\?]");
    Regex whitespaceRun = new Regex("\\s+");

    string spacedOut = unwantedChar.Replace(HTML, " ");
    return whitespaceRun.Replace(spacedOut, " ");
}
// Takes an HTML page and extracts its text.
public string UnHTML(string HTML)
{
HTML = removeTAG(HTML,"");
HTML = removeTAG(HTML,"