
/* OpenWebSpider
*
*  Authors:     Stefano Alimonti AND Stefano Fantin
*  Version:     0.7
*  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
*
*
* This file is part of OpenWebSpider
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
*/

#ifndef __URLFUNCT
#define __URLFUNCT


/* ReturnFirstUrl
* Host <-
* Marks the host currently stored in *Host as crawled (hostlist.status=1, together
* with the indexed_pages / time_sec / bytes_downloaded / error_pages counters),
* then loads the next host to crawl from the table hostlist into *Host.
*
* Returns:
*   1  *Host describes the next host to crawl, or starthostonly==1
*      (iQuit is set and no other host is selected)
*   0  the URL read from hostlist was rejected by ParseUrl()
*  -1  Host is NULL, or the query produced no usable row
*/
int ReturnFirstUrl(struct sHost* Host)
{
	MYSQL_ROW row;
	char sqlQuery[MAXQUERYSIZE];
	MYSQL_RES gRes;
	MYSQL_RES** tmpRes=NULL;
	int ret;
	
	if(Host==NULL)
	{
		printf("Critical error\r\n\r\n");
		return -1;
	}
	
	my_mysql_ping(&gMysqlDB1,BLOCKDB1);
	
	//insert current host as viewed (hostlist.status=1)
	//NOTE(review): the hostname is formatted into the statement unescaped - confirm it cannot contain a quote
	if(Host->Host[0]!=0)
	{
		sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%d,time_sec=%d,bytes_downloaded=%d, error_pages=%d WHERE hostname = \'%s\' limit 1"
,nPagesViewed, (int)((GetTickCount()-startTimeMS)/1000), bytesDownloaded, nErrorPages ,Host->Host);
		
		my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);
	}
	
	printStats(Host,(iDoNextHost==1)?2:0);
	
	if(starthostonly==1)
	{
		iQuit=1;
		return 1;
	}
	
	/* we have to crawl the current host (ows server switch to an input-defined host) */
	if(nextHost)
	{
		AddExternalHost(*nextHost, NULL);
		sprintf(sqlQuery,"(select CONCAT('http://', hostname),port, id from hostlist where hostname='%s' and port = %i) union all (select CONCAT('http://', hostname),port, id from hostlist where status=0 ORDER BY priority DESC) limit 1",nextHost->Host, nextHost->port);
		
		FREE(nextHost);
		nextHost = NULL;
	}
	else
		sprintf(sqlQuery,"select CONCAT('http://', hostname) ,port, id from hostlist where status=0 ORDER BY priority DESC, id limit 1");
	
	/* NOTE(review): the allocation is sized for a MYSQL_RES although only a MYSQL_RES* is stored; kept as in the original */
	tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));
	
	if(tmpRes==NULL)
		MemoryCorruptedHandler("ReturnFirstUrl");
	
	my_mysql_query_and_store_results(&gMysqlDB1, sqlQuery,tmpRes,&gRes,BLOCKDB1);	//May return null!!!
	
	if(mysql_affected_rows(&gMysqlDB1)==0)				//all buffers empty
		ret = -1;
	else if((row = mysql_fetch_row(&gRes))==NULL)		//there's no results (it seems so...)
		ret = -1;
	else if(ParseUrl(row[0],Host,NULL)==-1)				//Wrong URL???
		ret = 0;
	else
	{
		/* every query above selects exactly three columns: url (0), port (1), id (2) */
		Host->port = atoi(row[1]);
		Host->host_id = atoi(row[2]);
		ret = 1;
	}
	
	/* single cleanup path shared by every outcome */
	if(*tmpRes)
	{
		mysql_free_result(*tmpRes);
	}
	
	FREE(tmpRes);
	
	return ret;
}


/* AddUrl
* hst   -> the parsed link (passed by value)
* level -> depth of the page that contains the link
* from  -> the page that contains the link (may be NULL)
*
* A link to a host other than IndexingHost (with Free Indexing Mode off) is
* handed to AddExternalHost(). Otherwise the link is checked against
* robots.txt, the crawl limits and the depth limits and, if it is not already
* in the list lstFirst, appended to it with level+1.
*
* Returns 1 if the link was handed to AddExternalHost() or queued,
* -1 if it was filtered out, is already queued, or memory ran out.
*/
int AddUrl(struct sHost hst, unsigned int level,struct sHost* from)
{
	char* sqlQuery; 
	
	
	/* the link points to another host and Free Indexing Mode is off: just remember the host */
	/* bFreeIndexingMode == 1 == Index all pages of the current host and not */
	if(stricmp(IndexingHost.Host,hst.Host)!=0 && bFreeIndexingMode==0)
	{
		AddExternalHost(hst,from);
		return 1;
	}
	
	/* if we are in the free indexing mode we will index this page as it was of the current indexing host but we must add this host to the table hostlist */
	/* and we must delete the current page from the Index (pagelist) */
	if(bFreeIndexingMode==1)
	{
		AddExternalHost(hst,from);
		
		sqlQuery = malloc(MAXQUERYSIZE);
		
		if(sqlQuery==NULL)
		{
			MemoryCorruptedHandler("AddUrl");
			return -1;		/* only reached if the handler returns */
		}
		
		snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"DELETE FROM pagelist WHERE hostname =\'%s\' AND page=\'%s\'",hst.Host, hst.Page);
		my_mysql_query(&gMysqlDB2, sqlQuery, BLOCKINDEX);
		FREE(sqlQuery);
	}
	else
	{
		/* we are in the same host */
		if(from)
			hst.host_id = from->host_id;
	}
	
	if(hst.host_id==0)
		hst.host_id = GetHostId(hst);
	
	if(nRelationships==2)
	{
		pRelationships(from,&hst,nRelationships);
	}
	
	/* Check the current page against the robots.txt, 
	   the maximum level of depth and the maximum number of pages to be indexed,
	   the number of seconds,
	   the number of bytes downloaded
	   or if we are switching to the next host */
	if(
		CheckRobotExclusion(hst.Page)==0 
		|| checkLimits() == 1
		|| (EXTRA_LIMITS.nMaxDepthLevel == 0 && CRAWLER_LIMITS.nMaxDepthLevel>0 && level >= CRAWLER_LIMITS.nMaxDepthLevel)
		|| (EXTRA_LIMITS.nMaxDepthLevel>0 && level >= EXTRA_LIMITS.nMaxDepthLevel)
		|| iDoNextHost==1)
		return -1;
	
	if(lstGetNodeByHost(lstFirst,hst)!=NULL)	//already in list
		return -1;
	
	hst.level = level+1;
	lstAddHost(&lstFirst,hst);
	
	return 1;
}

/* AddExternalHost
* Host -> host to store (passed by value)
* from -> page that links to Host; may be NULL
*
* Inserts Host (hostname, port, status 0) into the table hostlist with
* INSERT IGNORE and, when "from" is given, records the relationship through
* pRelationships().
*
* Returns 1 when the host was stored, and also when nothing is done because
* bTesting, bAddExternalHost, iQuit or bKillThread is set.
* Returns -1 when the hostname contains a space or memory ran out.
*/
int AddExternalHost(struct sHost Host,struct sHost* from)
{
	char* sqlQuery;
	char sError[MAXHOSTSIZE+50];
	
	if(bTesting==1 || bAddExternalHost==1)
		return 1;
	
	if(iQuit==1 || bKillThread==1)
		return 1;
	
	/* a hostname must not contain a space, not even as its first character */
	if(strchr(Host.Host ,' ')!=NULL)
	{
		sprintf(sError,"AddExternalHost(): Found wrong url: %s",Host.Host);
		printf("\r\n %s \r\n",sError);
		ERROR_LOG(sError);
		
		/* NOTE(review): no matching block on BLOCKEXH is visible in this function - confirm the caller holds it */
		thrdUnBlock(BLOCKEXH);
		return -1;
	}
	
	sqlQuery = malloc(MAXQUERYSIZE);
	
	if(sqlQuery==NULL)
	{
		MemoryCorruptedHandler("AddExternalHost");
		return -1;		/* only reached if the handler returns */
	}
	
	snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO hostlist (hostname,port,status) VALUES('%s',%d, 0);",Host.Host, Host.port);
	my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);
	
	FREE(sqlQuery);
	
	
	/* "from" could be NULL (ows server switch to an input-defined host) */
	if(from)
		pRelationships(from,&Host,nRelationships);
	
	return 1;
}


/* GetDir
* Page -> dir <-
* Page = "/dir1/dir2/page.htm" => dir = "/dir1/dir2/"
*
* Writes into dir the directory part of Page, always ending with '/'.
* Everything from the first '?' on is ignored, unless the '?' is the very
* first character of Page. When there is no '/' after index 0, dir is "/".
*
* Page is only read. dir must have room for strlen(Page)+2 bytes.
* Returns 1, or -1 if Page or dir is NULL.
*/
int GetDir(char* Page,char* dir)
{
	const char* query;
	size_t pathLen;
	size_t last=0;
	size_t i;
	
	if(Page==NULL || dir==NULL)
		return -1;
	
	/* length of the part of Page that is searched for '/' */
	query = strchr(Page,'?');
	if(query!=NULL && query!=Page)
		pathLen = (size_t)(query-Page);
	else
		pathLen = strlen(Page);
	
	/* index of the last '/' in that part (0 when there is none after index 0) */
	for(i=0;i<pathLen;i++)
		if(Page[i]=='/')
			last=i;
	
	if(last==0)
	{
		dir[0]='/';
		dir[1]=0;
		return 1;
	}
	
	/* copy everything before the last '/' ... */
	memmove(dir,Page,last);
	dir[last]=0;
	
	/* ... and close it with exactly one '/' */
	if(dir[last-1]!='/')
	{
		dir[last]='/';
		dir[last+1]=0;
	}
	
	return 1;
}


/* CheckPage
* page -><-
* Normalizes the path in "page" in place:
*  - a leading "./" is dropped
*  - every "./" whose preceding character is not '.' is removed
*  - a trailing '.' is dropped
*  - ".." segments remove the directory before them
*
* Returns 1 on success (pages shorter than 2 characters are left untouched),
* -1 if the page starts with a space or, while ".." segments are resolved,
* a segment starts with a space.
*/
int CheckPage(char* page)
{
	int c;
	int b;
	char tmpPage[MAXPAGESIZE+2000];
	char rTmpPage[MAXPAGESIZE];
	char *rPos;
	char *slash;
	int i;
	
	memset(rTmpPage,0,sizeof(rTmpPage));
	
	strncpy(rTmpPage,page,MIN(strlen(page),MAXPAGESIZE-1));
	
	if(strlen(page)<2)
		return 1;
	
	if(page[0]==' ')
		return -1;
	
	/* source and destination overlap: memmove, not strcpy */
	if(page[0]=='.' && page[1]=='/')
		memmove(rTmpPage,rTmpPage+2,strlen(rTmpPage+2)+1);
	
	for(i=1;rTmpPage[i]!=0 && i<MAXPAGESIZE-1;i++)
	{
		if(rTmpPage[i-1] != '.' && rTmpPage[i] == '.' && rTmpPage[i+1]=='/')
		{
			/* cut the two characters "./" out of the string */
			memmove(rTmpPage+i,rTmpPage+i+2,strlen(rTmpPage+i+2)+1);
			
			/* step back to look at the joined text again, but never below index 1
			   (the loop reads rTmpPage[i-1]) */
			i = (i>=2) ? i-2 : 0;
		}
	}
	
	if(rTmpPage[i-1]=='.')
		rTmpPage[i-1]=0;
	else
		rTmpPage[i]=0;
	
	if(strstr(rTmpPage,"..")==0)
	{
		strcpy(page,rTmpPage);
		return 1;
	}
	
	/* rebuild the path segment by segment, resolving ".." */
	rPos=rTmpPage;
	
	if(page[0]=='/')
	{
		tmpPage[0]='/';
		tmpPage[1]=0;
	}
	else
		tmpPage[0]=0;
	
	while(rPos[0]!=0)
	{
		slash=strchr(rPos,'/');
		
		if(slash==NULL)		/* last segment: append it and stop */
		{
			strcat(tmpPage,rPos);
			break;
		}
		
		c=(int)(slash-rPos);	/* length of the current segment */
		
		if(rPos[0]==' ')
			return -1;
		
		/* NOTE: a zero-length segment (c==0) compares equal here and takes the ".." branch */
		if(strncmp(rPos,"..",c)!=0)
		{
			strncat(tmpPage,rPos,c+1);	/* segment plus its '/'; strncat terminates the string */
		}
		else
		{
			/* drop the last directory already written to tmpPage */
			for(b=strlen(tmpPage)-2;b>0;b--)
			{
				if(tmpPage[b]=='/')
				{
					tmpPage[b+1]=0;
					break;
				}
			}
			if(b==0)
			{
				tmpPage[0] = (tmpPage[0]=='/') ? '/' : '\0';
				tmpPage[1] = '\0';
			}
		}
		rPos+=c+1;
	}
	
	if(tmpPage[0]==0)
	{
		tmpPage[0]='/';
		tmpPage[1]=0;
	}
	
	strcpy(page,tmpPage);
	
	return 1;
}

/* PageEndsWith
* Returns 1 if "page" ends with "ext" (case-insensitive), 0 otherwise.
* An extension longer than the page never matches.
*/
static int PageEndsWith(const char* page,const char* ext)
{
	size_t pageLen = strlen(page);
	size_t extLen = strlen(ext);
	
	if(extLen>pageLen)
		return 0;
	
	return (stricmp(page+pageLen-extLen,ext)==0) ? 1 : 0;
}

/* PageType
* Host <-
* Host->Page = "/test.htm" Host->type = 1 (type htm/html)
*
* Sets Host->type from the page name (the part before any '?'):
*   1 = html: the page ends with '/', matches HtmlExtensions, or has a query
*       string and no known extension
*   2 = matches PlainTextExtension
*   4 = matches CustomExtensions
*   3 = anything else (discard it)
* A page without any '.' and without a query string is treated as a
* directory: a '/' is appended to Host->Page and the type is 1.
*
* Returns 1, or -1 if Host is NULL or there is no room to append the '/'.
*/
int PageType(struct sHost* Host)
{
	int i;
	char rPage[MAXPAGESIZE];
	char* query;
	int bArgs=0;		/*bArgs=1 == the page contains a '?'*/
	size_t pageLen;
	int slHP;
	
	
	if(Host==NULL)
		return -1;
	
	memset(rPage,0,MAXPAGESIZE);
	
	strncpy(rPage,Host->Page,MAXPAGESIZE-1);
	
	query = strchr(rPage,'?');
	if(query!=NULL)				//does this page contain a '?'
	{
		*query=0;				//cut it
		bArgs=1;
	}
	
	pageLen = strlen(Host->Page);
	
	/* the length test keeps an empty page from being indexed at -1 */
	if(pageLen>0 && Host->Page[pageLen-1]=='/')
	{
		Host->type = 1;			// Html file
		return 1;
	}
	
	slHP = MIN(pageLen,MAXPAGESIZE);
	for(i=0;i<slHP;i++)
	{
		if(Host->Page[i]=='.')
			break;
	}
	
	if(i==(signed)pageLen && bArgs==0)    //Maybe a directory (no '.' found)
	{
		if(pageLen>=MAXPAGESIZE-1)
			return -1;
		
		strcat(Host->Page,"/");
		Host->type = 1;            // Html file
		return 1;
	}
	
	for(i=0;HtmlExtensions[i][0]!=0;i++)
	{
		if(PageEndsWith(rPage,(char*)HtmlExtensions[i]))
		{
			Host->type = 1;        // Html file
			return 1;
		}
	}
	
	for(i=0;PlainTextExtension[i][0]!=0;i++)
	{
		if(PageEndsWith(rPage,(char*)PlainTextExtension[i]))
		{
			Host->type = 2;
			return 1;
		}
	}
	
	/*Support for custom extensions*/ /*TO TEST*/
	for(i=0;CustomExtensions[i][0]!=0;i++)
	{
		if(PageEndsWith(rPage,(char*)CustomExtensions[i]))
		{
			Host->type = 4;
			
			return 1;
		}
	}
	
	if(bArgs==1)
		Host->type = 1;
	else
		Host->type = 3;            //discard it
	
	return 1;
}

/* PortNumFromHostname
* hostname -><-
* hostname="www.auuuu.com:90" => hostname="www.auuuu.com"; return 90;
*
* Cuts hostname at its first ':' and returns the port number written after it.
* Returns PORT when hostname is NULL, has no ':', or the text after the ':'
* is empty, not a number, 0 or greater than 65535.
*/
unsigned int PortNumFromHostname(char* hostname)
{
	char* colon;
	unsigned long port;
	
	if(hostname==NULL)
		return PORT;
	
	colon = strchr(hostname,':');
	
	if(colon==NULL)
		return PORT;
	
	*colon=0;		/* hostname now holds the host only */
	
	/* strtoul, unlike atoi, has defined behaviour for out-of-range input */
	port = strtoul(colon+1,NULL,10);
	
	if(port==0 || port>65535UL)
		return PORT;
	
	return (unsigned int)port;
}

/* GenerateURL
* Host -> URL <-
* Writes "http://" + Host.Host + ":" + Host.port + Host.Page into URL.
* URL must be large enough for the whole string. Always returns 1.
*/
int GenerateURL(struct sHost Host,char* URL)
{
	char port[16];		/* room for any int in decimal plus the terminator (5 bytes overflowed on 5-digit ports) */
	
	sprintf(port,"%d",Host.port);
	sprintf(URL,"http://%s:%s%s",Host.Host,port,Host.Page);
	
	return 1;
}


/* ParseUrlFinish
* Common tail of ParseUrl(): trims sh->Host and sh->Page, replaces "&amp;"
* with "&" in the page, takes over the host_id of currentHost when host and
* port are the same, then runs CheckPage() and PageType() on the page.
* Returns -1 if memory ran out or CheckPage() fails, else the result of PageType().
*/
static int ParseUrlFinish(struct sHost* sh,struct sHost* currentHost)
{
	char* tmpPage;
	
	strtrim(sh->Host,sh->Host);
	
	tmpPage = malloc(MAXPAGESIZE);
	
	if(tmpPage==NULL)
	{
		MemoryCorruptedHandler("ParseUrl");
		return -1;		/* only reached if the handler returns */
	}
	
	strtrim(sh->Page,tmpPage);
	
	ReplaceStr(tmpPage,sh->Page,"&amp;","&");
	
	FREE(tmpPage);
	
	/* currentHost has the same hostname and port and has an host_id */
	if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )
	{
		/* yes: this page is from the same domain: use currentHost host_id */
		sh->host_id = currentHost->host_id;
	}
	
	if(CheckPage(sh->Page)==-1)
		return -1;
	
	return PageType(sh);
}

/* ParseUrl
* Url <- sHost
* Url: "http://www.test.com/page.htm" ==>
*	==> sHost.Host = "www.test.com" &&  sHost.Page = "/page.htm"
*
* url         -> the link to parse; it is cut in place at the first '#'
* sh          <- zeroed, then filled with host, port, page (and type, via PageType())
* currentHost -> host of the page that contains the link; needed for relative
*                urls, may be NULL for urls starting with "http://" or "//"
*
* Returns -1 for NULL arguments, too long or empty urls, unsupported
* protocols, an empty or too long host, a relative url without currentHost,
* or when CheckPage()/PageType() reject the page; otherwise the result of PageType().
*/
int ParseUrl(char* url,struct sHost* sh,struct sHost* currentHost)
{
	char tUrl[MAXURLSIZE];
	char BaseDir[MAXPAGESIZE];
	unsigned int offset=0,i;
	size_t urlLen;
	size_t hostLen;
	char* fragment;
	int isPage=0;			/* a '.' was found after the last '/' */
	int extra=0;			/* room reserved for an appended '/' */
	int appendSlash=0;
	
	if(url==NULL || sh==NULL)
		return -1;
	
	if(strlen(url)>MAXURLSIZE-1)
		return -1;
	
	if( strnicmp(url,"ftp://",6)==0    || 
		strnicmp(url,"mailto:",7)==0   || 
		strnicmp(url,"about:",6)==0    ||
		strnicmp(url,"irc://",6)==0    ||
		strnicmp(url,"news://",7)==0   ||
		strnicmp(url,"https://",8)==0)    //protocols not supported
		return -1;
	
	memset(sh,0,sizeof(struct sHost));
	
	/* drop the fragment */
	fragment = strchr(url,'#');
	if(fragment!=NULL)
		*fragment=0;
	
	if(url[0]==0)
		return -1;
	
	if(strnicmp(url,"http://",7)==0)
		offset=7;
	else if(strncmp(url,"//",2)==0)
		offset=2;
	
	strcpy(tUrl,url+offset);		/* fits: strlen(url) <= MAXURLSIZE-1 was checked above */
	
	if(offset>0)	//url with prefix: "http://" || "//"
	{
		/* the host is everything before the first '/' or '?' */
		hostLen = strcspn(tUrl,"/?");
		
		/* nothing after the prefix ("http://", "//") or no host at all ("http:///x") */
		if(hostLen==0)
			return -1;
		
		/* sh->Host holds at most MAXHOSTSIZE-1 characters */
		if(hostLen>MAXHOSTSIZE-1)
			return -1;
		
		memcpy(sh->Host,tUrl,hostLen);		/* sh was zeroed, so the host stays terminated */
		
		if(tUrl[hostLen]!=0)
			strncpy(sh->Page,tUrl+hostLen,MAXPAGESIZE-1);	//the page is the rest
		else
			strcpy(sh->Page,"/");							//no page: the page is the index
		
		sh->port = PortNumFromHostname(sh->Host);
		
		return ParseUrlFinish(sh,currentHost);
	}
	
	//now we expect a relative url
	urlLen = strlen(tUrl);
	
	if(urlLen>MAXPAGESIZE-1)
		return -1;
	
	if(currentHost==NULL)	//if we haven't a reference host we can't continue
		return -1;
	
	strncpy(sh->Host,currentHost->Host,MIN(MAXHOSTSIZE-1,strlen(currentHost->Host)));
	
	if(tUrl[0]!='/')	//if the first char is not '/' we must consider the current directory
		GetDir(currentHost->Page,BaseDir);
	else
		BaseDir[0]=0;
	
	/* scan backwards (index 0 is not inspected): is there a '.' after the last '/'? */
	for(i=(unsigned int)urlLen;i>0;i--)
	{
		if(tUrl[i]=='/')
			break;
		
		if(tUrl[i]=='.')		//yes: this is a page Ex. "/sources.html"
		{
			isPage=1;
			break;
		}
	}
	
	/* no '.' and the url does not end at the '/' found (Ex. "dir1/something"):
	   we consider it a directory, unless it carries a '?' */
	if(!isPage && i!=urlLen-1)
	{
		extra=1;
		appendSlash = (strchr(tUrl,'?')==NULL);
	}
	
	if(strlen(BaseDir)+urlLen+extra>=MAXPAGESIZE)
		return -1;
	
	strcpy(sh->Page,BaseDir);
	strcat(sh->Page,tUrl);
	
	if(appendSlash)
		strcat(sh->Page,"/");
	
	//get the port from the current Host
	sh->port = currentHost->port;
	
	return ParseUrlFinish(sh,currentHost);
}

/* GetHostId
*  Looks up host.Host / host.port in the table hostlist of the database DB1.
*  Returns the id of the matching row, or 0 when no row is fetched.
*/
int GetHostId(struct sHost host)
{
	MYSQL_ROW idRow;
	MYSQL_RES resBuf;
	MYSQL_RES** resHandle=NULL;
	char* query;
	unsigned int hostId;
	
	/* both buffers are allocated first and checked together */
	resHandle=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));
	query = malloc(MAXQUERYSIZE);
	
	if(resHandle==NULL || query==NULL)
		MemoryCorruptedHandler("GetHostId");
	
	snprintf_mysql_escaped_sql_statement(&gMysqlDB2,query,MAXQUERYSIZE-1,"SELECT id FROM %s.hostlist WHERE hostname='%s' AND port = %d LIMIT 1", DB1, host.Host, host.port);
	
	my_mysql_query_and_store_results(&gMysqlDB2, query,resHandle,&resBuf,BLOCKINDEX);
	
	FREE(query);
	
	/* at most one row is expected (LIMIT 1); no row means "unknown host" */
	idRow = mysql_fetch_row(&resBuf);
	
	hostId = (idRow) ? atoi(idRow[0]) : 0;
	
	if(*resHandle)
		mysql_free_result(*resHandle);
	
	FREE(resHandle);
	
	return hostId;
}


/* pRelationships
* links  -> the linking page; when NULL, IndexingHost is used as the linking host
* linked -> the linked page
* level  -> 1 = save the relationship between hosts only (pages stored as '/')
*           2 = save the relationship between the two pages
*
* Inserts a row into the table rels (INSERT IGNORE) with linked->Description
* as the link text. Host ids that are still 0 are looked up with GetHostId().
*
* Returns 1 when the row was sent to the database or bTesting is set,
* 0 when level is not 1 or 2, linked is NULL, one of the host ids is
* unknown, or memory ran out.
*/
int pRelationships(struct sHost* links,struct sHost* linked,int level)
{
	char* sqlQuery;
	struct sHost* source;
	int host_id;
	int linkedhost_id;
	
	if(bTesting==1)
		return 1;
	
	if(level<1 || level>2)
		return 0;
	
	if(linked==NULL)
		return 0;
	
	source = links ? links : &IndexingHost;
	
	if(source->host_id == 0)
		host_id = GetHostId(*source);
	else
		host_id = source->host_id;
	
	if(linked->host_id == 0)
		linkedhost_id = GetHostId(*linked);
	else
		linkedhost_id = linked->host_id;
	
	if( host_id==0 || linkedhost_id==0 )
		return 0;
	
	sqlQuery = malloc(MAXQUERYSIZE);
	
	if(sqlQuery==NULL)
	{
		MemoryCorruptedHandler("pRelationships");
		return 0;		/* only reached if the handler returns */
	}
	
	if(level==1)	//saves hostname only
		snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) VALUES(%d,%d,'/','/', '%s')",host_id,linkedhost_id , linked->Description );
	else			//level==2: saves the pages too
		snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) VALUES(%d,%d,'%s','%s', '%s')",host_id,linkedhost_id,(links?links->Page:"/"),linked->Page , linked->Description );
	
	my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);
	FREE(sqlQuery);
	
	return 1;
}

/* UnencodeHexValue
* Returns the value (0-15) of the hexadecimal digit c, or -1 if c is not one.
*/
static int UnencodeHexValue(char c)
{
	if(c>='0' && c<='9')
		return c-'0';
	if(c>='a' && c<='f')
		return c-'a'+10;
	if(c>='A' && c<='F')
		return c-'A'+10;
	return -1;
}

/* unencode
* Decodes the percent-encoded text in [src, last] (both ends included) into
* dest and terminates dest with '\0'.
*
* "%XX" (two hexadecimal digits) becomes the character with that code.
* A '%' that is not followed by two hexadecimal digits inside [src, last]
* becomes '?', and the (up to two) characters after it are skipped.
* Nothing beyond "last" is ever read.
*
* dest needs room for (last - src + 2) bytes.
*/
void unencode(char *src, char *last, char *dest)
{
	int hi;
	int lo;
	
	if(dest==NULL)
		return;
	
	if(src==NULL || last==NULL)
	{
		*dest = '\0';
		return;
	}
	
	while(src <= last)
	{
		if(*src != '%')
		{
			*dest++ = *src++;
			continue;
		}
		
		if(last - src >= 2)		/* both digits lie inside the input */
		{
			hi = UnencodeHexValue(src[1]);
			lo = UnencodeHexValue(src[2]);
			
			*dest++ = (hi>=0 && lo>=0) ? (char)(hi*16+lo) : '?';
			src += 3;
		}
		else					/* truncated escape at the end of the input */
		{
			*dest++ = '?';
			src = last+1;
		}
	}
	
	*dest = '\0';
	
	return;
}


#endif

/*EOF*/



