
/* OpenWebSpider
 *
 *  Author:     Stefano Alimonti aka Shen139
 *  Mail:       shen139 [at] openwebspider (dot) org
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef __URLFUNCT
#define __URLFUNCT


/* AddUrl
 * hst   -> the url to add (passed by value; only the local copy is modified)
 * level -> depth level of the page the url was found on
 * from  -> the host/page that links to hst (forwarded as is to the helpers)
 *
 * A url whose host differs (case-insensitively) from IndexingHost.Host is
 * handed to AddExternalHost() and 1 is returned whatever that call returns.
 *
 * A url of the host being indexed is appended to the lstFirst list with
 * level+1, unless:
 *  - CheckRobotExclusion(hst.Page) returns 0
 *  - maxDepthLevel is not 0 and level >= maxDepthLevel
 *  - lstGetNodeByHost() already finds it in the list
 * In these three cases -1 is returned, otherwise 1.
 */
int AddUrl(struct sHost hst, int level,struct sHost* from)
{
	/* link to another host */
	if(stricmp(IndexingHost.Host,hst.Host)!=0)
	{
		AddExternalHost(hst,from);
		return 1;
	}

	/* page-level relationships are stored for internal links too */
	if(nRelationships==2)
		pRelationships(from,&hst,nRelationships);

	if(CheckRobotExclusion(hst.Page)==0)
		return -1;

	if(maxDepthLevel!=0 && level>=maxDepthLevel)
		return -1;

	if(lstGetNodeByHost(lstFirst,hst)!=NULL)	//already in list
		return -1;

	hst.level = level+1;
	lstAddHost(&lstFirst,hst);

	return 1;
}

/* AddExternalHost
 * Host -> the external host to queue
 * from -> the host/page that links to it (forwarded to pRelationships)
 *
 * Stores the relationship from -> Host and queues Host in the hostlist table
 * with status 0 (INSERT DELAYED, no check for duplicates is made here).
 *
 * Returns  1 if the query was issued, or if nothing had to be done because
 *            bTesting, bAddExternalHost, iQuit or bKillThread is set to 1
 *         -1 if the hostname is refused or the query does not fit sqlQuery
 */
int AddExternalHost(struct sHost Host,struct sHost* from)
{
char sqlQuery[MAXQUERYSIZE];
char sError[MAXHOSTSIZE+50];
int n;

	if(bTesting==1 || bAddExternalHost==1)
		return 1;

	if(iQuit==1 || bKillThread==1)
		return 1;

	/* A hostname never contains a blank; a quote or a backslash would also
	 * break out of the quoted SQL literal built below, so refuse them too.
	 */
	if(strpbrk(Host.Host," '\\")!=NULL)
	{
		snprintf(sError,sizeof(sError),"AddExternalHost(): Found wrong url: %s",Host.Host);
		printf("\r\n %s \r\n",sError);
		ERROR_LOG(sError);

		/* no thrdUnBlock(BLOCKEXH) here: this function no longer takes that lock */
		return -1;
	}

	pRelationships(from,&Host,nRelationships);

	n=snprintf(sqlQuery,sizeof(sqlQuery),"INSERT DELAYED INTO hostlist (hostname,status) VALUES('http://%s',0);",Host.Host);

	if(n<0 || (size_t)n>=sizeof(sqlQuery))		//a truncated statement must not be sent
		return -1;

	my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);

return 1;
}


/* GetDir
 * Page -> dir <-
 * Copies into dir the part of Page that precedes its last '/' and makes sure
 * the result ends with '/'.
 * Page = "/dir1/dir2/page.htm" => dir = "/dir1/dir2/"
 * Page = "/page.htm" or "page.htm" (last '/' missing or at index 0) => dir = "/"
 *
 * dir must be big enough for the result (at most strlen(Page)+1 bytes,
 * 2 bytes at least). Always returns 1.
 */
int GetDir(char* Page,char* dir)
{
char* lastSlash=strrchr(Page,'/');
size_t cut;

	/* no usable directory part: the root */
	if(lastSlash==NULL || lastSlash==Page)
	{
		dir[0]='/';
		dir[1]=0;
		return 1;
	}

	/* everything before the last '/' ... */
	cut=(size_t)(lastSlash-Page);
	memcpy(dir,Page,cut);
	dir[cut]=0;

	/* ... plus a closing '/' unless the copy already ends with one ("/a//b") */
	if(dir[cut-1]!='/')
	{
		dir[cut]='/';
		dir[cut+1]=0;
	}

	return 1;
}


/* CheckPage
 * page -><-
 * Cleans up a page path in place:
 *  - removes a leading "./" and every "./" that is not preceded by a '.'
 *  - removes one trailing '.'
 *  - if a ".." is still there, rebuilds the path segment by segment going
 *    one directory back for every ".." segment
 * Ex.: "/a/./b/../c.htm" => "/a/c.htm"
 *
 * Only the first MAXPAGESIZE-1 chars of page are considered.
 * Pages shorter than 2 chars are left untouched.
 *
 * Returns 1 on success, -1 if page is NULL, starts with ' ' or (while
 * rebuilding) a segment starts with ' '.
 */
int CheckPage(char* page)
{
int c;
int b;
char tmpPage[MAXPAGESIZE+2000];
char rTmpPage[MAXPAGESIZE];
char *rPos;
char *slash;
size_t len;
int i;

	if(page==NULL)
		return -1;

	memset(rTmpPage,0,sizeof(rTmpPage));

	len=strlen(page);
	if(len>(size_t)(MAXPAGESIZE-1))
		len=(size_t)(MAXPAGESIZE-1);
	memcpy(rTmpPage,page,len);

	if(strlen(page)<2)
		return 1;

	if(page[0]==' ')
		return -1;

	/* source and destination overlap: memmove, not strcpy */
	if(page[0]=='.' && page[1]=='/')
		memmove(rTmpPage,rTmpPage+2,strlen(rTmpPage+2)+1);

	/* remove every "./" that is not the tail of a "../" */
	i=1;
	while(i<MAXPAGESIZE-1 && rTmpPage[i]!=0)
	{
		if(rTmpPage[i-1] != '.' && rTmpPage[i] == '.' && rTmpPage[i+1]=='/')
		{
			memmove(rTmpPage+i,rTmpPage+i+2,strlen(rTmpPage+i+2)+1);

			/* look again at the char before the removed pair, but never
			 * step below index 1 (index 0 has no preceding char to test)
			 */
			i=(i>1) ? i-1 : 1;
		}
		else
			i++;
	}

	/* i is now on the terminator (or on the last slot of the buffer) */
	if(rTmpPage[i-1]=='.')
		rTmpPage[i-1]=0;
	else
		rTmpPage[i]=0;

	if(strstr(rTmpPage,"..")==NULL)		//nothing else to resolve
	{
		strcpy(page,rTmpPage);
		return 1;
	}

	rPos=rTmpPage;

	if(page[0]=='/')
	{
		tmpPage[0]='/';
		tmpPage[1]=0;
	}
	else
		tmpPage[0]=0;

	while(rPos[0]!=0)
	{
		slash=strchr(rPos,'/');

		if(slash==NULL)		//last segment: append it and stop
		{
			strcat(tmpPage,rPos);
			break;
		}

		c=(int)(slash-rPos);		//length of the current segment

		if(rPos[0]==' ')
			return -1;

		/* NOTE: only the first c chars are compared, so an empty segment
		 * (c==0, e.g. the leading '/') also takes the ".." branch; there it
		 * is harmless because tmpPage is still too short to be cut.
		 */
		if(strncmp(rPos,"..",c)!=0)
		{
			strncat(tmpPage,rPos,c+1);		//segment + its '/'
		}
		else
		{
			/* cut tmpPage after the '/' that precedes its last segment */
			for(b=(int)strlen(tmpPage)-2;b>0;b--)
			{
				if(tmpPage[b]=='/')
				{
					tmpPage[b+1]=0;
					break;
				}
			}
			if(b==0)		//no inner '/' found: back to the root / to nothing
			{
				tmpPage[0] = (tmpPage[0]=='/') ? '/' : '\0';
				tmpPage[1] = '\0';
			}
		}
		rPos+=c+1;
	}

	if(tmpPage[0]==0)
	{
		tmpPage[0]='/';
		tmpPage[1]=0;
	}

	strcpy(page,tmpPage);

return 1;
}

/* PageType
 * Host <-
 * Sets Host->type from the extension of Host->Page (arguments after '?' are
 * not part of the extension):
 *   1 = html     (page ends with '/', matches HtmlExtensions, has no '.'
 *                 and no arguments, or has arguments and no known extension)
 *   2 = matches PlainTextExtension
 *   4 = matches CustomExtensions
 *   3 = unknown extension without arguments: discard it
 * Ex.: Host->Page = "/test.htm" => Host->type = 1
 *
 * A page without any '.' and without arguments is considered a directory and
 * gets a trailing '/' appended to Host->Page.
 *
 * Returns 1 on success, -1 if Host is NULL or there is no room for that '/'.
 */
int PageType(struct sHost* Host)
{
int i;
char rPage[MAXPAGESIZE];
char* args;
int bArgs=0;		/*bArgs=1 == the page contains a '?'*/
size_t pageLen;
size_t rLen;
size_t extLen;


	if(Host==NULL)
		return -1;

	memset(rPage,0,MAXPAGESIZE);

	strncpy(rPage,Host->Page,MAXPAGESIZE-1);

	args=strchr(rPage,'?');				//does this page contain a '?'
	if(args!=NULL)
	{
		*args=0;						//cut it
		bArgs=1;
	}

	pageLen=strlen(Host->Page);

	if(pageLen>0 && Host->Page[pageLen-1]=='/')
	{
		Host->type = 1;			// Html file
	return 1;
	}

	if(strchr(Host->Page,'.')==NULL && bArgs==0)    //Maybe a directory (no '.' found)
	{
		if(pageLen>=(size_t)(MAXPAGESIZE-1))
			return -1;

		strcat(Host->Page,"/");
		Host->type = 1;            // Html file
	return 1;
	}

	/* An extension longer than the page itself can't match: checking the
	 * lengths first keeps the compared pointer inside rPage.
	 */
	rLen=strlen(rPage);

	for(i=0;HtmlExtensions[i][0]!=0;i++)
	{
		extLen=strlen(HtmlExtensions[i]);
		if(extLen<=rLen && stricmp(rPage+rLen-extLen,(char*)HtmlExtensions[i])==0)
		{
			Host->type = 1;        // Html file
		return 1;
		}
	}

	for(i=0;PlainTextExtension[i][0]!=0;i++)
	{
		extLen=strlen(PlainTextExtension[i]);
		if(extLen<=rLen && stricmp(rPage+rLen-extLen,(char*)PlainTextExtension[i])==0)
		{
			Host->type = 2;
		return 1;
		}
	}

	/*Support for custom extensions*/ /*TO TEST*/
	for(i=0;CustomExtensions[i][0]!=0;i++)
	{
		extLen=strlen(CustomExtensions[i]);
		if(extLen<=rLen && stricmp(rPage+rLen-extLen,(char*)CustomExtensions[i])==0)
		{
			Host->type = 4;
		return 1;
		}
	}

	if(bArgs==1)
		Host->type = 1;            //unknown extension but with arguments: html
	else
		Host->type = 3;            //discard it

return 1;
}

/* PortNumFromHostname
 * hostname -><-
 * Cuts hostname at its first ':' and returns the port number that follows.
 * hostname="www.auuuu.com:90" => hostname="www.auuuu.com"; return 90;
 *
 * PORT is returned if hostname is NULL, has no ':', or if what follows the
 * ':' is not a number between 1 and 65535 ("host:", "host:abc", "host:0").
 * In the last case the hostname is cut at the ':' anyway.
 */
int PortNumFromHostname(char* hostname)
{
char* colon;
char* end;
long port;

	if(hostname==NULL)
		return PORT;

	colon=strchr(hostname,':');

	if(colon==NULL)
		return PORT;

	*colon=0;

	port=strtol(colon+1,&end,10);

	if(end==colon+1 || port<1 || port>65535)	//no digits or not a valid port
		return PORT;

return (int)port;
}

/* GenerateURL
 * Host -> URL <-
 * Writes "http://<Host.Host>:<Host.port><Host.Page>" into URL.
 * Ex.: Host = "www.test.com", port = 80, Page = "/page.htm"
 *      => URL = "http://www.test.com:80/page.htm"
 *
 * The port is formatted straight into URL: the old 5-byte temporary buffer
 * was too small for ports with 5 digits (10000..65535).
 *
 * URL must be big enough for the whole string (the caller's responsibility;
 * no size is passed). Always returns 1.
 */
int GenerateURL(struct sHost Host,char* URL)
{
	sprintf(URL,"http://%s:%d%s",Host.Host,Host.port,Host.Page);

return 1;
}


/* FinishParsedUrl
 * (private helper of ParseUrl)
 * Last step shared by every kind of url: trims sh->Host and sh->Page,
 * turns "&amp;" into "&" in the page, then runs CheckPage() and PageType().
 * Returns -1 if memory can't be allocated or CheckPage() fails, otherwise
 * what PageType() returns.
 */
static int FinishParsedUrl(struct sHost* sh)
{
char* tmpPage;

	strtrim(sh->Host,sh->Host);

	tmpPage = malloc(MAXPAGESIZE);

	if(tmpPage==NULL)
		return -1;

	strtrim(sh->Page,tmpPage);

	ReplaceStr(tmpPage,sh->Page,"&amp;","&");

	FREE(tmpPage);

	if(CheckPage(sh->Page)==-1)
		return -1;

	return PageType(sh);
}

/* ParseUrl
 * url -> sh <-
 * Splits url into sh->Host, sh->port, sh->Page and sets sh->type.
 * Url: "http://www.test.com/page.htm" ==>
 *	==> sh->Host = "www.test.com" &&  sh->Page = "/page.htm"
 *
 * - "http://host..." and "//host..." are absolute urls: the host ends at the
 *   first '/' or '?'; the port comes from "host:port" (see PortNumFromHostname)
 * - anything else is relative to currentHost: host and port are copied from
 *   it; if the url doesn't start with '/' the directory of currentHost->Page
 *   is prepended
 * - ftp, mailto, about, irc, news and https urls are not supported
 *
 * url is modified: it is cut at the first '#'.
 * sh is zeroed before being filled; it is not valid if -1 is returned.
 *
 * Returns -1 if the url can't be used (NULL argument, too long, unsupported
 * protocol, empty, empty or too long host, relative url without currentHost,
 * CheckPage() failure), otherwise what PageType() returns.
 */
int ParseUrl(char* url,struct sHost* sh,struct sHost* currentHost)
{
char tUrl[MAXURLSIZE];
char BaseDir[MAXPAGESIZE];
unsigned int offset=0,i;
char* token1;
char* fragment;
size_t tLen;
size_t hostLen;
int bKeepAsIs=0;	/*1 == page or directory closed by '/': no '/' to append*/

	if(url==NULL || sh==NULL)
		return -1;

	if(strlen(url)>MAXURLSIZE-1)
		return -1;

	if( strnicmp(url,"ftp://",6)==0    || 
		strnicmp(url,"mailto:",7)==0   || 
		strnicmp(url,"about:",6)==0    ||
		strnicmp(url,"irc://",6)==0    ||
		strnicmp(url,"news://",7)==0   ||
		strnicmp(url,"https://",8)==0)    //protocols not supported
			return -1;

	memset(sh,0,sizeof(struct sHost));
	memset(tUrl,0,MAXURLSIZE);

	fragment=strchr(url,'#');		//the fragment is not part of the page
	if(fragment!=NULL)
		*fragment=0;

	if(url[0]==0)
		return -1;

	if(strnicmp(url,"http://",7)==0)
	{
		if(strlen(url)==7)			//nothing after the prefix
			return -1;

		offset=7;
	}

	if(strncmp(url,"//",2)==0)
	{
		if(strlen(url)==2)			//nothing after the prefix
			return -1;

		offset=2;
	}

	tLen=strlen(url)-offset;		//fits: strlen(url) was checked above
	memcpy(tUrl,url+offset,tLen);
	tUrl[tLen]=0;

	if(offset>0)	//url with prefix: "http://" || "//"
	{
		token1=strpbrk(tUrl,"/?");		//end of the host

		if(token1==tUrl)				//empty host ("http:///page")
			return -1;

		if(token1!=NULL)		//the host is the part before '/' (or '?') and the page the rest
		{
			hostLen=(size_t)(token1-tUrl);

			if(hostLen>(size_t)(MAXHOSTSIZE-1))
				return -1;

			strncpy(sh->Host,tUrl,hostLen);		//sh is zeroed: always terminated
			strncpy(sh->Page,token1,MAXPAGESIZE-1);
		}
		else					//no page: the host is the url and the page is the index
		{
			if(tLen>(size_t)(MAXHOSTSIZE-1))
				return -1;

			strncpy(sh->Host,tUrl,MAXHOSTSIZE-1);
			strcpy(sh->Page,"/");
		}

		sh->port = PortNumFromHostname(sh->Host);

		return FinishParsedUrl(sh);
	}

	//now we expect a relative url

	if(strlen(url)>MAXPAGESIZE-1)
		return -1;

	if(currentHost==NULL)	//if we haven't a reference host we can't continue
		return -1;

	strncpy(sh->Host,currentHost->Host,MAXHOSTSIZE-1);		//sh is zeroed: always terminated

	if(tUrl[0]!='/')	//if the first char is not '/' we must consider the current directory
		GetDir(currentHost->Page,BaseDir);
	else
		BaseDir[0]=0;

	/* Scan backwards (index 0 excluded) up to the last '/':
	 *  - a '.' found before it: this is a page           Ex. "/sources.html"
	 *  - the '/' is the last char: this is a directory   Ex. "sample/"
	 *  - otherwise ("dir1/something") we consider it a directory that
	 *    misses its closing '/'
	 */
	for(i=(unsigned int)tLen;i>0;i--)
	{
		if(tUrl[i]=='/')
			break;

		if(tUrl[i]=='.')
		{
			bKeepAsIs=1;
			break;
		}
	}

	if(i==tLen-1)
		bKeepAsIs=1;

	//room for BaseDir + url (+ the '/' that may be appended)
	if(strlen(BaseDir)+tLen+(bKeepAsIs ? 0 : 1)>=MAXPAGESIZE)
		return -1;

	strcpy(sh->Page,BaseDir);
	strcat(sh->Page,tUrl);

	if(!bKeepAsIs && strchr(tUrl,'?')==NULL)	//if there is a '?' in the "directory" we consider it a page
		strcat(sh->Page,"/");

	//get the port from the current Host
	sh->port = currentHost->port;

	return FinishParsedUrl(sh);
}


/* pRelationships
 * links  -> the linking host/page (NULL: IndexingHost.Host and page "/" are used)
 * linked -> the linked host/page
 * level  -> 1: saves the hostnames only (both pages are stored as "/")
 *           2: saves hostnames and pages
 *
 * Queues a row in the rels table (INSERT DELAYED, no check for duplicates).
 *
 * Returns 1 if the query was issued or if bTesting is 1 (nothing is done),
 *         0 if level is not 1 or 2, linked is NULL or the statement does not
 *           fit sqlQuery (nothing is sent in these cases).
 *
 * NOTE(review): hosts and pages come from crawled documents and are pasted
 * unescaped between single quotes; a page containing a ' breaks the statement
 * (SQL injection). They should be escaped with the MySQL client library
 * before being used here.
 */
int pRelationships(struct sHost* links,struct sHost* linked,int level)
{
char sqlQuery[MAXQUERYSIZE];
char* fromHost;
char* fromPage;
int n;

	if(bTesting==1)
		return 1;

	if(level<1 || level>2)
		return 0;

	if(linked==NULL)
		return 0;

	fromHost=(links ? links->Host : IndexingHost.Host);
	fromPage=(links ? links->Page : "/");

	if(level==1)	//saves hostname only
		n=snprintf(sqlQuery,sizeof(sqlQuery),"INSERT DELAYED INTO rels (host,linkedhost,page,linkedpage) VALUES('http://%s','http://%s','/','/')",fromHost,linked->Host);
	else
		n=snprintf(sqlQuery,sizeof(sqlQuery),"INSERT DELAYED INTO rels (host,linkedhost,page,linkedpage) VALUES('http://%s','http://%s','%s','%s')",fromHost,linked->Host,fromPage,linked->Page);

	if(n<0 || (size_t)n>=sizeof(sqlQuery))		//a truncated statement must not be sent
		return 0;

	my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);

	return 1;
}

/* unencodeHexValue
 * (private helper of unencode)
 * Value of the hexadecimal digit ch, -1 if ch is not a hexadecimal digit
 */
static int unencodeHexValue(char ch)
{
	if(ch>='0' && ch<='9')
		return ch-'0';
	if(ch>='a' && ch<='f')
		return ch-'a'+10;
	if(ch>='A' && ch<='F')
		return ch-'A'+10;

return -1;
}

/* unencode
 * src -> dest <-
 * Decodes the url-encoded ("%XX") chars of the string that starts at src and
 * ends at last (last is the last char, not the terminator) into dest.
 * Ex.: "a%20b" => "a b"
 *
 * A '%' that is not followed by two hexadecimal digits (inside src..last) is
 * replaced by '?' and the (at most) two chars that follow it are skipped.
 * Nothing is read after last.
 *
 * dest is always 0-terminated and never longer than the source, so it needs
 * at most (last-src)+2 bytes; dest may be src itself.
 */
void unencode(char *src, char *last, char *dest)
{
int hi;
int lo;
long left;		/*chars between src and last*/

	for(; src <= last; src++, dest++)
	{
		if(*src != '%')
		{
			*dest = *src;
			continue;
		}

		left = (long)(last-src);

		hi = (left>=1) ? unencodeHexValue(src[1]) : -1;
		lo = (left>=2) ? unencodeHexValue(src[2]) : -1;

		if(hi>=0 && lo>=0)
			*dest = (char)(hi*16+lo);
		else
			*dest = '?';

		src += (left>=2) ? 2 : left;	//skip the two digits, never going after last
	}

	*dest = '\0';

return;
}


#endif

/*EOF*/



