
/* OpenWebSpider
 *
 *  Author:     Stefano Alimonti aka Shen139
 *  Mail:       shen139 [at] openwebspider (dot) org
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef __HTMLFNCT
#define __HTMLFNCT


/* ForgeHTTPPacket
 * hst -> packet <-
 *
 * Builds an HTTP/1.0 GET request for hst.Page on hst.Host and writes it,
 * NUL-terminated, into packet. Every space in hst.Page is sent as "%20".
 *
 *   hst.Page = "/my page.htm" ==> packet starts with "GET /my%20page.htm HTTP/1.0"
 *
 * packet must be large enough for the whole request; its size is not passed
 * in, so it cannot be checked here.
 *
 * Returns 1 on success. Returns 0 and leaves packet as an empty string when
 * the escaped path does not fit in MAXURLSIZE bytes.
 */
int ForgeHTTPPacket(struct sHost hst,char * packet)
{
char unicodedFilename[MAXURLSIZE];
size_t pageLen=strlen(hst.Page);	/* evaluated once, not on every iteration */
size_t g=0;
size_t i;

	for(i=0;i<pageLen;i++)
	{
		if(hst.Page[i]==' ')
		{
			/* need 3 bytes for "%20" plus room for the final terminator */
			if(g+3>=sizeof(unicodedFilename))
			{
				packet[0]=0;
				return 0;
			}
			memcpy(unicodedFilename+g,"%20",3);
			g+=3;
		}
		else
		{
			if(g+1>=sizeof(unicodedFilename))
			{
				packet[0]=0;
				return 0;
			}
			unicodedFilename[g++]=hst.Page[i];
		}
	}

	/* always terminated, including when hst.Page is empty */
	unicodedFilename[g]=0;

	sprintf(packet,"GET %s HTTP/1.0\r\nAccept: */*\r\nHost: %s\r\nUser-Agent: OpenWebSpider/%s (http://www.openwebspider.org)\r\n\r\n",unicodedFilename,hst.Host,VERSION);

return 1;
}

/* ParseHTTPRequest
 * recvdpkt -> htmlOut <- maxout -> httpHeader <- stuff <- level ->
 *
 * Splits a received HTTP response into its status line and its body.
 *
 *  recvdpkt   NUL-terminated response, starting with the status line.
 *  htmlOut    receives the bytes that follow the first "\r\n\r\n".
 *  maxout     used as the byte count of recvdpkt: (maxout - body offset)
 *             bytes are copied, capped at MAXPACKETSIZE-1.
 *             NOTE(review): presumably the number of valid bytes in
 *             recvdpkt -- confirm against the caller.
 *  httpHeader receives the status line (first MAXHTTPSTATUSSIZE bytes are
 *             cleared first; left empty if the line is too long).
 *  stuff      receives the value of the "Location" header when the status
 *             line starts with "HTTP/1.1 302", or "<Url too long>".
 *  level      passed through to AddUrl() for a redirect target.
 *
 * Returns 0 if recvdpkt does not start with "HTTP/1" or has no header
 * terminator, 2 if the status line starts with "HTTP/1.1 200" or
 * "HTTP/1.0 200", and 1 for any other response.
 */
int ParseHTTPRequest(char* recvdpkt,char* htmlOut,int maxout,char* httpHeader, char* stuff,int level)
{
int c;
int pktLen;
int bodyLen;
char* pTmp;
char* pUrl;
int loc;
char *sLocation=stuff;
struct sHost locHost;

	if(strnicmp(recvdpkt,"HTTP/1",6)!=0)
		return 0;

	memset(httpHeader,0,MAXHTTPSTATUSSIZE);
	/* NOTE(review): only MAXHOSTSIZE bytes are cleared here, while up to
	 * MAXURLSIZE-1 bytes may be copied into sLocation below. Confirm the
	 * size of the caller's buffer and which of the two limits is intended. */
	memset(sLocation,0,MAXHOSTSIZE);

	pTmp=strstr(recvdpkt,"\r\n");

	/* the copy is shorter than the cleared area, so httpHeader stays terminated */
	if(pTmp && pTmp>recvdpkt && pTmp-recvdpkt<MAXHTTPSTATUSSIZE)
	{
		strncpy(httpHeader,recvdpkt,pTmp-recvdpkt);
	}

/*
	recvdpkt:		"HTTP/1.1 302 Found
					Date: Sun, 27 Mar 2005 09:15:55 GMT
					Server: Apache/1.3.33 (Unix) PHP/4.3.10
					X-Powered-By: PHP/4.3.10
					Location: http://www.openwebspider.org
					Connection: close
					Transfer-Encoding: chunked
					Content-Type: text/html
					"
*/
	loc=0;

	if(strnicmp(recvdpkt,"HTTP/1.1 302",12)==0)
		loc=1;

	/* recvdpkt is not modified below, so its length is measured once */
	pktLen=(signed)strlen(recvdpkt);

	for(c=0;c<pktLen;c++)
	{
		/* The value is taken 10 bytes after the match ("Location: ").
		 * Make sure bytes 8 and 9 exist so that offset stays inside the string. */
		if(loc==1 && strncmp(recvdpkt+c,"Location",8)==0 && recvdpkt[c+8]!=0 && recvdpkt[c+9]!=0)
		{
			pUrl=recvdpkt+c+10;
			pTmp=strstr(pUrl,"\r\n");
			if(pTmp)
			{
				if(pTmp-pUrl<MAXURLSIZE)
				{
					strncpy(sLocation,pUrl,pTmp-pUrl);

					/* queue the redirect target only if it parsed;
					 * otherwise locHost is left uninitialised */
					if(ParseUrl(sLocation,&locHost,&IndexingHost)!=-1)
						AddUrl(locHost,level,NULL);
				}
				else
				{
					strcpy(sLocation,"<Url too long>");
				}
			}
		}

		/* end of the header block: everything after it is the body */
		if(strncmp(recvdpkt+c,"\r\n\r\n",4)==0)
		{
			/* never hand memcpy a negative length */
			bodyLen=maxout-(c+4);
			if(bodyLen<0)
				bodyLen=0;

			memcpy(htmlOut,recvdpkt+c+4,MIN(bodyLen,MAXPACKETSIZE-1));
			return (strnicmp(recvdpkt,"HTTP/1.1 200",12)==0 || strnicmp(recvdpkt,"HTTP/1.0 200",12)==0)?2:1;
		}
	}

return 0;
}
/* RemoveTag
 * html <-> startTag -> endTag ->
 *
 * Removes, in place, every region of html that begins with startTag and
 * runs through the next endTag after it, leaving a single space where the
 * region was (used for HTML comments, based on a change by Marius Roibu).
 *
 *   html = "a<!-- x -->b", startTag = "<!--", endTag = "-->"  ==>  "a b"
 *
 * The match is case-sensitive. A startTag with no endTag after it is left
 * untouched and ends the scan. html never grows.
 *
 * Returns html.
 */
char* RemoveTag(char* html, char* startTag, char* endTag)
{
size_t startLen;
size_t endLen;
size_t offset;
char* scanFrom;
char* startTagFound;
char* endTagFound;
char* tail;

    if(html==NULL || startTag==NULL || endTag==NULL)
        return html;

    startLen = strlen(startTag);
    endLen = strlen(endTag);
    if(startLen==0 || endLen==0)
        return html;

    scanFrom = html;

    while((startTagFound = strstr(scanFrom,startTag)) != NULL)
    {
        /* the end tag must come AFTER the start tag; an end tag that
         * appears earlier in the text is unrelated and is ignored */
        endTagFound = strstr(startTagFound,endTag);
        if(endTagFound==NULL || endTagFound==startTagFound)
            break;

        tail = endTagFound+endLen;

        /* replace the region with one space and close the gap; the two
         * ranges overlap, so memmove (not strcat/strcpy) is required */
        startTagFound[0] = ' ';
        memmove(startTagFound+1, tail, strlen(tail)+1);

        /* resume just far enough back that a start tag straddling the
         * inserted space would still be found */
        offset = (size_t)(startTagFound-html);
        scanFrom = (offset >= startLen-1) ? startTagFound-(startLen-1) : html;
    }

return html;
}

/* BetweenTag
 * html text -> tag -> out <- endwithstarttag -> maxout ->
 *
 * Finds the first "<tag" in html (case-insensitive) that is followed by a
 * space, '>', CR, LF or tab, and copies what comes after the tag name
 * into out:
 *
 *   endwithstarttag == 1: up to (not including) the closing "</tag>"
 *        "<p align=center>bye bye</p>"  ==>  " align=center>bye bye"
 *   otherwise:            up to (not including) the next '>'
 *        "<img src=sample.jpg>"         ==>  " src=sample.jpg"
 *
 * The search runs on an upper-cased copy of html; the text copied to out
 * comes from html itself, so its case is preserved.
 *
 * maxout is the size of out in bytes, terminator included.
 *
 * Returns the offset in html just past the tag name, plus one, on success.
 * Returns -1 if tag is too long for the local buffers, if no opening tag is
 * found (out untouched), or if the end is missing, the content is empty, or
 * it does not fit in out (out set to "").
 */
int BetweenTag(char* html, char* tag,char* out,int endwithstarttag,int maxout)
{
char tmptag1[MAXTAGSIZE+1], tmptag2[MAXTAGSIZE+3];
int iRelPos=0;
int outLen;
char* tmpPacket;
char* startTag;
char* tmpP;

	/* "<tag" needs strlen(tag)+2 bytes and "</tag>" needs strlen(tag)+4;
	 * both buffers impose the same limit on tag */
	if(strlen(tag)+2>sizeof(tmptag1))
		return -1;

	sprintf(tmptag1,"<%s",tag);
	sprintf(tmptag2,"</%s>",tag);

	/* upper-case the search pattern once rather than on every iteration */
	_strupr(tmptag1);

	tmpPacket = malloc(MAXPACKETSIZE);

	if(tmpPacket==NULL)
	{
		MemoryCorruptedHandler("BetweenTag");
		/* NOTE(review): in case the handler returns, do not go on with a
		 * NULL buffer */
		return -1;
	}

	atoupper(html,tmpPacket,MAXPACKETSIZE-1);

	/* keep looking until the match is a whole tag name, e.g. skip "<ABBR"
	 * when looking for "<A" */
	do
	{
		startTag=strstr(tmpPacket+iRelPos,tmptag1);
		if(startTag==NULL)
		{
			FREE(tmpPacket);
			return -1;
		}

		iRelPos=(startTag-tmpPacket)+strlen(tmptag1);

	}while(tmpPacket[iRelPos]!=' ' && tmpPacket[iRelPos]!='>' && tmpPacket[iRelPos]!='\r' && tmpPacket[iRelPos]!='\n' && tmpPacket[iRelPos]!='\t');

	if(endwithstarttag==1)					//Ex. <A href=sample.c>sample code</A>
		tmpP=strstr(tmpPacket+iRelPos,_strupr(tmptag2));
	else							//Ex. <IMG src=sample.jpg>
		tmpP=strchr(tmpPacket+iRelPos,'>');

	/* strictly less than maxout: the terminator needs the last byte */
	if(tmpP!=NULL && tmpP>tmpPacket+iRelPos && tmpP-(tmpPacket+iRelPos)<maxout)
	{
		outLen=(int)(tmpP-(tmpPacket+iRelPos));

		/* offsets are the same in both buffers; copy from the original
		 * so the case of the text is kept */
		strncpy(out,html+iRelPos,outLen);
		out[outLen]=0;

		FREE(tmpPacket);

	return iRelPos+1;
	}

	out[0]=0;

	FREE(tmpPacket);

return -1;
}

/* UnHtml
 * html <-> text <- maxout ->
 *
 * Strips markup from html and writes the remaining text to text.
 *
 *   html = "<p align="left"><font face="Arial" size="2">TesT123</font></p>"
 *   text => " TesT123 "
 *
 * - html is modified in place first: comments, "<script ... </script>" and
 *   "<style ... </style>" regions are removed by RemoveTag().
 * - Everything from a '<' to the next '>' is dropped; each '<' is replaced
 *   by one space unless the output already ends with a space.
 * - ' " and \ are turned into spaces, and a space is never written
 *   directly after another space.
 *
 * maxout is the size of text in bytes. text is always NUL-terminated and
 * holds at most maxout-1 characters; input is consumed until it ends or
 * text is full.
 *
 * Returns the number of characters written to text (terminator excluded),
 * or 0 when maxout <= 0.
 */
int UnHtml(char* html, char* text,int maxout)
{
int i, x=0, pOpen=0;
unsigned char curC;

	if(maxout<=0)
		return 0;

	memset(text,0,maxout);

    RemoveTag(html,"<!--","-->");
    RemoveTag(html,"<script","</script>");
    RemoveTag(html,"<style","</style>");

	/* each pass writes at most one character, so checking the room once
	 * per pass is enough to keep the last byte free for the terminator */
	for(i=0;html[i]!=0 && x<maxout-1;i++)
	{
		curC=html[i];

		if(curC=='<')
		{
			pOpen=1;

			/* "a<br>b"  => "a b";
			   "a <br>b" => "a b"
			   x==0 is tested first so text[-1] is never read */
			if(x==0 || text[x-1]!=' ')
				text[x++]=' ';
		}
		else
		if(curC=='>')
			pOpen=0;

		if(pOpen==0 && curC!='>')
		{
			/* quotes and backslashes become spaces */
			if(curC=='\''  || curC=='\"'  || curC=='\\')
				curC=' ';

			/* never write two spaces in a row */
			if(x && text[x-1]==' ' && curC==' ')
				continue;

			text[x++]=curC;
		}
	}

	text[x]=0;

return x;
}


/* LookForUrls
 * html <-> hst -> AddUrl() <-
 *
 * Scans html for URLs and hands each one to AddUrl().
 *
 * html is modified in place (comments are removed with RemoveTag()).
 * For every entry of the global taglist[] (the list ends at flag == -1):
 *   - BetweenTag() extracts each occurrence of the tag named bTag,
 *   - the attribute named eTag is located inside it and its value is read,
 *     either quoted ("..." or '...') or unquoted,
 *   - the value is trimmed, decoded with unencode() and resolved with
 *     ParseUrl() against the referring host.
 * A "base" tag replaces the referring host for the URLs that follow; for an
 * "a" tag the link text becomes the Description of the new URL. Values that
 * start with "javascript:" and URLs of type 3 are skipped.
 *
 * hst is the page being scanned: it is the initial referring host, and both
 * hst.level and &hst are passed to AddUrl().
 *
 * Returns the number of URLs passed to AddUrl().
 */
int LookForUrls(char *html,struct sHost hst)
{
char a2a[MAXTAGLENGTH];			//<a>...........</a>
char tmpurl[MAXURLSIZE];
char trimurl[MAXURLSIZE];
char encodedurl[MAXURLSIZE];
char fnd[MAXDESCRIPTIONSIZE];
char strComment[MAXDESCRIPTIONSIZE];
int strlenhtml;
int c,i,x,tmpc,y;
struct sHost tmphst;
int apix=0;
int stage;
int nUrlFound=0;
struct sHost sBaseHref;
struct sHost* sReferringHost=&hst;

    RemoveTag(html,"<!--","-->");

    strlenhtml=strlen(html);

	for(y=0;taglist[y].flag!=-1;y++)
	{
	tmpc=c=0;
		
		while(c<strlenhtml)
		{
			/* one byte of a2a is held back so the NUL terminator always fits */
			if((tmpc = BetweenTag(html+c,taglist[y].bTag,a2a,taglist[y].flag ,sizeof(a2a)-1))==-1)
				break;			

			ReplaceChr(a2a,'\n',' ');
			ReplaceChr(a2a,'\r',' ');

			/* move past this tag for the next search */
   			c += tmpc+strlen(taglist[y].bTag);     
				
			/* stage: 0 = looking for the attribute name,
			 *        1 = attribute found, looking for the start of its value,
			 *        2 = copying the value into tmpurl,
			 *        3 = done
			 * apix:  1 when the value is quoted, 0 when it is bare */
			stage=0;
			x=0;
			apix=0;

			for(i=0;i<(signed)strlen(a2a);i++)
			{
				switch(stage)
				{
				case 0:		//looks for the attribute name
					if(strnicmp(a2a+i,taglist[y].eTag,strlen(taglist[y].eTag))==0)
					{
						stage=1;	//attribute found
						i+=strlen(taglist[y].eTag);
						memset(tmpurl,0,sizeof(tmpurl));
					}
					break;
				case 1:		//looks for '\"' or '\'' 
					if(a2a[i]=='\"' || a2a[i]=='\'' )					//opening quote found
					{
						stage=2;
						apix=1;
						break;
					}
					else
					if(a2a[i]=='=')
						break;
					if(a2a[i]!=' ' && a2a[i]!='\n' && a2a[i]!='\r')		//If a2a[i] is not a delimiter consider it as data (apix=0)
					{
						stage=2;
						i--;		//re-read this character as the first byte of the value
					}
					break;
				case 2:
                    /* URL too long */
					if(x>=MAXURLSIZE-1)
					{
						stage=3;
						tmpurl[0]=0;
						break;
					}

					if(apix==1) /* "<a href="test.htm">test</a>" OR "<a href='test.htm'>test</a>" */
					{
						if(a2a[i]!='\"' && a2a[i]!='\'' )	//until the closing quote is found
						{
							tmpurl[x++]=a2a[i];
							break;
						}
						else								//closing quote found
						{
							stage=3;
							tmpurl[x]=0;
							break;
						}
					}
					else    /* "<a href=test.htm>test</a>" */
					{
						/* a bare value ends at a space, '>', a quote, or the
						 * last character of a2a */
						if(a2a[i]!=' ' && a2a[i]!='>' && a2a[i]!='\"' && a2a[i]!='\'' && strlen(a2a+i)!=1)
						{
							tmpurl[x++]=a2a[i];
							break;
						}
						else								//end of the value
						{
							stage=3;
							tmpurl[x]=0;
							break;
						}
					}
				} /*switch*/
				if(stage==3)							//exits from for{}
					break;
			} /*for*/
		
		if(stage==3)
			if(tmpurl[0]!=0 && strnicmp(tmpurl,"javascript:",11)!=0)
			{

				strtrim(tmpurl, trimurl);
				memset(encodedurl,0,sizeof(encodedurl));
				unencode(trimurl,trimurl+strlen(trimurl),encodedurl);	//Support 4 unicode

				fnd[0]=0;
				if(stricmp(taglist[y].bTag,"base")==0)	//if TAG is BASE
				{
					/* a valid <base> becomes the referring host for later URLs;
					 * it is not added as a URL itself */
					if(ParseUrl(encodedurl,&sBaseHref,NULL)==-1)
						continue;
					sReferringHost=&sBaseHref;
					continue;

				}
				else
				if(stricmp(taglist[y].bTag,"a")==0)	//if TAG is A
				{
					//a2a neither starts with '<' nor ends with '>', so make it look like a tag for UnHtml
					a2a[0]='<';
					if(strlen(a2a)<sizeof(a2a)-1)	//append only when there is room for it
						strcat(a2a,">");

					//looks for the href's comment <a href...>XXX</a>
					UnHtml(a2a,fnd,sizeof(fnd));

					UnToken(fnd,"\r\n\t",strComment,strlen(fnd));

					OnlyOneSpace(strComment,fnd,sizeof(fnd));
				}

				if(ParseUrl(encodedurl,&tmphst,sReferringHost)==-1)
					continue;

				if(bTokenIn(encodedurl,"<>\r\n\t\\",strlen(encodedurl))==0)
				{
					tmphst.viewed = 0;	
					memcpy(tmphst.Description,fnd,MAXDESCRIPTIONSIZE-1);

					if(tmphst.type != 3)	//Add only HTML or plain/text file or custom html files
					{
						nUrlFound++;
						AddUrl(tmphst,hst.level,&hst);
					}
				}
			}
		}
	}

return nUrlFound;
}

#endif

/*EOF*/

