
/* OpenWebSpider
*
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.7
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
*
*
* This file is part of OpenWebSpider
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
*/

#ifndef __MISC
#define __MISC

#ifndef WIN32

/*
 * POSIX replacement for the Win32 GetTickCount() call.
 *
 * Reads the wall clock with gettimeofday() and returns it in milliseconds.
 * The millisecond count does not fit in an int, so only its low-order bits
 * are returned (the value wraps); callers are expected to work with the
 * difference between two readings, not with the absolute value.
 *
 * Returns 0 if the clock cannot be read.
 */
int GetTickCount()
{
	struct timeval tv;
	unsigned long long ms;

	if(gettimeofday(&tv, NULL) != 0)
		return 0;

	/*
	 * Do the arithmetic in an unsigned 64-bit type: tv_sec*1000 does not
	 * fit in a 32-bit signed long and signed overflow is undefined.
	 */
	ms = (unsigned long long)tv.tv_sec * 1000ULL
	   + (unsigned long long)tv.tv_usec / 1000ULL;

	/* Wrap explicitly through unsigned int before handing back an int. */
	return (int)(unsigned int)ms;
}

/*
 * Stub for the Win32 SetConsoleTitle() call on non-Windows builds.
 * The title text is ignored and nothing is done.
 */
void SetConsoleTitle(char* msg)
{
	(void)msg;	/* intentionally unused */
}

/*
 * Case-insensitive string comparison under its Win32 name.
 * Forwards to strcasecmp() and returns its result unchanged
 * (<0, 0 or >0).
 */
int stricmp(char*a,char*b)
{
	int order = strcasecmp(a, b);

	return order;
}

/*
 * Case-insensitive comparison of at most c characters, under its Win32
 * name. Forwards to strncasecmp() and returns its result unchanged.
 */
int strnicmp(char*a,char*b,int c)
{
	int order = strncasecmp(a, b, c);

	return order;
}

/*
 * Win32-style Sleep(): suspend the caller for n milliseconds.
 * The value is converted to microseconds and handed to usleep().
 */
void Sleep(int n)
{
	unsigned usec = (unsigned)n * 1000;

	usleep(usec);
}

/*
 * Convert the string a to upper case in place (Win32 _strupr() stand-in).
 *
 * Returns a. A NULL argument is returned unchanged.
 */
char* _strupr(char*a)
{
	char *p;

	if(a == NULL)
		return NULL;

	/*
	 * toupper() is only defined for EOF and values representable as
	 * unsigned char, so bytes >= 0x80 must not be passed as a (possibly
	 * negative) plain char.
	 */
	for(p = a; *p != '\0'; p++)
		*p = (char)toupper((unsigned char)*p);

	return a;
}

/*
 * Convert the string a to lower case in place (Win32 _strlwr() stand-in).
 *
 * Returns a. A NULL argument is returned unchanged.
 */
char* _strlwr(char*a)
{
	char *p;

	if(a == NULL)
		return NULL;

	/*
	 * tolower() is only defined for EOF and values representable as
	 * unsigned char, so bytes >= 0x80 must not be passed as a (possibly
	 * negative) plain char.
	 */
	for(p = a; *p != '\0'; p++)
		*p = (char)tolower((unsigned char)*p);

	return a;
}

/*
 * Winsock-style closesocket(): on this platform a socket is an ordinary
 * descriptor, so it is released with close(). Returns close()'s result.
 */
int closesocket(int s)
{
	int rc = close(s);

	return rc;
}

/*
 * Win32-style ExitThread(): terminate the calling thread with exit code a.
 *
 * The code is carried in the pointer value itself. Handing pthread_exit()
 * the address of the parameter would give a joiner a pointer into the
 * stack of a thread that no longer exists. A joiner recovers the code
 * with (int)(long)value.
 *
 * Does not return; the int return type is kept for source compatibility.
 */
int ExitThread(int a)
{
	pthread_exit((void *)(long)a);

	return 0;	/* not reached */
}

/*
 * Win32-style TerminateThread(): request cancellation of thread.
 *
 * The second argument is ignored. Returns whatever pthread_cancel()
 * returns.
 */
int TerminateThread(pthread_t thread,int nothing)
{
	int rc;

	(void)nothing;	/* unused on this platform */
	rc = pthread_cancel(thread);

	return rc;
}

/*
 * Stub for the Win32 CloseHandle() call on non-Windows builds.
 * The handle is ignored and nothing is released.
 */
void CloseHandle(HANDLE a)
{
	(void)a;	/* intentionally unused */
}

#endif

/*
 * Open the two MySQL connections used by the crawler:
 *   gMysqlDB1 -> server n.1 (hosts)
 *   gMysqlDB2 -> server n.2 (pages)
 *
 * Progress is written to stdout, failures to stderr and to the error log.
 *
 * Returns 1 when both connections are open, -1 as soon as one attempt
 * fails.
 */
int InitMysql()
{
	SetConsoleTitle("Connecting to mysql...");

	/* --- server n.1: hosts --- */
	printf("Connecting to Mysql server n.1 (%s)...",MYSQLSERVER1);
	if(!sqlConnect(MYSQLSERVER1, USERDB1, PASSDB1, DB1,&gMysqlDB1, MYSQLSERVER_PORT1))
	{
		fprintf(stderr, "ERROR\r\nFailed to connect to database(%s): Error: %s\r\n",DB1,mysql_error(&gMysqlDB1));
		ERROR_LOG(mysql_error(&gMysqlDB1));

		return -1;
	}

	/* --- server n.2: pages --- */
	printf("OK\r\nConnecting to Mysql server n.2 (%s)...",MYSQLSERVER2);
	if(!sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2,&gMysqlDB2, MYSQLSERVER_PORT2))
	{
		fprintf(stderr, "ERROR\r\nFailed to connect to database(%s): Error: %s\r\n",DB2,mysql_error(&gMysqlDB2));
		ERROR_LOG(mysql_error(&gMysqlDB2));

		/*
		 * NOTE(review): only the second handle is closed on this path;
		 * gMysqlDB1 stays open. Confirm that the caller cleans it up.
		 */
		mysql_close(&gMysqlDB2);

		return -1;
	}

	printf("OK\r\n");
	SetConsoleTitle("Connecting to mysql...OK");

	return 1;
}

int InitCrawler(struct sHost currentHst)
{
	memset(iLastPing,0,sizeof(iLastPing));

	printf("\r\n");
	printf("Start Host        : %s\r\n",currentHst.Host);
	printf("Start Page        : %s\r\n", currentHst.Page);
	printf("Scan Mode         : Index\r\n");
	printf("Mode              : %s\r\n",(starthostonly==1)?"Single Host":"Recursive");
	printf("Mysql server n.1  : %s\r\n",MYSQLSERVER1);
	printf("Mysql server n.2  : %s\r\n",MYSQLSERVER2);
    printf(" ---  Global Limits  ---\r\n");
    printf("Max pages         : %i\r\n",CRAWLER_LIMITS.nMaxPagesPerSite);
	printf("Max depth level   : %i\r\n",CRAWLER_LIMITS.nMaxDepthLevel);
    printf("Max seconds       : %i\r\n",CRAWLER_LIMITS.nMaxSecondsPerSite);
    printf("Max bytes         : %i\r\n",CRAWLER_LIMITS.nMaxBytesPerSite);
    printf(" -----------------------\r\n");
	printf("Surfing the net... (press CTRL+C to exit)\r\n");
	
	
	if(actAsAServerPort)
	{
		CreateServerThread(actAsAServerPort);
		Sleep(200);
	}
    
    if(!StartUpWinsock())
	{
		fprintf(stderr,"WSAStartup() error\r\n");
		ERROR_LOG("WSAStartup() error")
			return -1;
	}
	
	/* connect to mysql servers */
	if(InitMysql()==-1)
		return -1;
	
	SetConsoleTitle("Creating temp table...");
	
	do
	{
		RandomTable(gTable);
	}
	while(!CreateTmpTable(gTable));    //Loop until creates a new tmp table!!!

   	signal(SIGINT,  sigdie);
	signal(SIGTERM, sigdie);


return 1;
}

/*
 * Convert one column of the hostlist_extras row into a limit.
 * Returns the parsed value when it is > 0, otherwise 0 (also for a NULL
 * column, which the left outer join yields when no extras row exists).
 */
static int parsePositiveLimit(const char *column)
{
	int value;

	if(column == NULL)
		return 0;

	value = atoi(column);

	return (value > 0) ? value : 0;
}

/*
 * Load the per-host limits for host_id from hostlist_extras into
 * EXTRA_LIMITS.
 *
 * Only columns holding a value > 0 overwrite the matching EXTRA_LIMITS
 * field; every other field is left as the caller set it.
 *
 * Always returns 1. When the query yields no result set, EXTRA_LIMITS is
 * left untouched.
 */
int setHostExtras(int host_id)
{
	char sqlQuery[MAXQUERYSIZE];
	MYSQL_ROW row;
	MYSQL_RES gRes;
	MYSQL_RES *storedRes = NULL;	/* result handle to free; stays NULL if the query gives none */
	int limit;

	/* never let mysql_fetch_row() see an uninitialised result structure */
	memset(&gRes, 0, sizeof(gRes));

	snprintf(sqlQuery, sizeof(sqlQuery), "select hostlist_extras.max_pages,hostlist_extras.max_level,hostlist_extras.max_seconds, hostlist_extras.max_bytes from hostlist left outer join hostlist_extras on hostlist.id = hostlist_extras.host_id WHERE hostlist.id = %d ", host_id);

	my_mysql_query_and_store_results(&gMysqlDB1, sqlQuery, &storedRes, &gRes, BLOCKDB1);

	/* no result set: nothing to read and nothing to free */
	if(storedRes == NULL)
		return 1;

	if((row = mysql_fetch_row(&gRes)))
	{
		limit = parsePositiveLimit(row[0]);	/* max_pages */
		if(limit > 0)
			EXTRA_LIMITS.nMaxPagesPerSite = limit;

		limit = parsePositiveLimit(row[1]);	/* max_level */
		if(limit > 0)
			EXTRA_LIMITS.nMaxDepthLevel = limit;

		limit = parsePositiveLimit(row[2]);	/* max_seconds */
		if(limit > 0)
			EXTRA_LIMITS.nMaxSecondsPerSite = limit;

		limit = parsePositiveLimit(row[3]);	/* max_bytes */
		if(limit > 0)
			EXTRA_LIMITS.nMaxBytesPerSite = limit;
	}

	mysql_free_result(storedRes);

	return 1;
}

/*
 * Top-level crawl loop.
 *
 * Indexes currentHst, then keeps asking ReturnFirstUrl() for the next
 * host and indexing it until either no host is left (ReturnFirstUrl()
 * returns -1) or iQuit has been raised. Finishes by calling DoQuit().
 */
int CrawlerMainLoop(struct sHost currentHst)
{
	/* the first pass indexes the host handed in by the caller */
	for(;;)
	{
		InitIndexing(currentHst);

		/* set the current host as indexed and try to get another host to be indexed */
		if(ReturnFirstUrl(&currentHst) == -1)
		{
			fprintf(stderr,"\nBuffer empty\n");
			break;
		}

		/* ReturnFirstUrl may have requested shutdown */
		if(iQuit == 1)
			break;
	}

	DoQuit();

	return 1;
}

/*
 * Crawl and index a single host.
 *
 * Steps, in order:
 *   1. reset the per-host counters and EXTRA_LIMITS;
 *   2. mark the host in `hostlist` with status 2 (INSERT when it has no id
 *      yet, UPDATE otherwise) and make sure currentHst.host_id is set;
 *   3. rebuild the URL list (lstFirst) with "/robots.txt" followed by the
 *      start page;
 *   4. if a "modFilter" init handler is registered, call it; a return of 0
 *      aborts this host;
 *   5. load the per-host limits with setHostExtras() and print the limits
 *      in effect;
 *   6. unless bUpdate is set, delete the host's old rows from ii, pagelist
 *      and rels;
 *   7. start the worker threads and poll every 300 ms until iQuit is
 *      raised or the URL list has no node with value 0 or 2.
 *
 * currentHst is passed by value: the changes made here (host_id, viewed,
 * level) are not seen by the caller.
 *
 * Returns 0 when the modFilter init handler returns 0, 1 when the host has
 * been processed. When iQuit is seen the function finishes the bookkeeping
 * and calls DoQuit(), which exits the process, so it does not return on
 * that path.
 */
int InitIndexing(struct sHost currentHst)
{
	int condition = 1;          /* loop flag; never cleared, the loop is left with break */
	char* sqlQuery;             /* heap buffer of MAXQUERYSIZE bytes reused for every query */
	DWORD avgSec;               /* zeroed below, not used otherwise in this function */
	time_t long_time;
	struct tm *newtime;
	struct sHost *robots_txt;   /* temporary copy of the host pointing at /robots.txt */
	
#ifdef WIN32
	char strTitle[3000];
#endif
	
    /* reset the per-host state */
    iRobCrawlDelay  = 0;
    bRobotsOK       = 0;
    nPagesViewed    = 0;
	bytesDownloaded = 0;
	nErrorPages     = 0;
	startTimeMS     = 0;
    bKillThread     = 0;
    avgSec          = 0;
    EXTRA_LIMITS.nMaxBytesPerSite   = 0;
    EXTRA_LIMITS.nMaxDepthLevel     = 0;
    EXTRA_LIMITS.nMaxPagesPerSite   = 0;
    EXTRA_LIMITS.nMaxSecondsPerSite = 0;
	memset(lstRobotsExclusions,0,sizeof(lstRobotsExclusions));

	sqlQuery = malloc(MAXQUERYSIZE);
	if(sqlQuery==NULL)
		MemoryCorruptedHandler("InitIndexing");

    /* try to free the memory used */
    lstFreeAll(lstFirst);

	
    /* does this host exist? */
    if( currentHst.host_id == 0 )
        currentHst.host_id = GetHostId( currentHst );

    /* NOTE(review): Host is copied into the SQL text unescaped and unbounded - confirm it is sanitised upstream */
    if( currentHst.host_id == 0)   //no
	    //puts current hostname in the db as "Scanning host in progress.." (viewed==2)
		sprintf(sqlQuery,"INSERT INTO hostlist (hostname, port, status, lastvisit) VALUES('%s', %i, 2, curdate());", currentHst.Host, currentHst.port);
	else    //yes
        sprintf(sqlQuery,"UPDATE hostlist SET port=%i, status = 2, lastvisit=curdate() WHERE hostname =\'%s\' limit 1", currentHst.port, currentHst.Host);
		
	
	my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);
	
    /* 8legs mod */
    /* the host may just have been inserted: look its id up again */
    if( currentHst.host_id == 0 )
	    currentHst.host_id	= GetHostId( currentHst );

    robots_txt=(struct sHost*)malloc(sizeof(struct sHost));
	
	if(robots_txt==NULL)
		MemoryCorruptedHandler("InitIndexing");
	
	/* first list entry: a copy of the host with Page = /robots.txt, level 1, type 1 */
	currentHst.viewed = 0;
	memcpy(robots_txt,&currentHst,sizeof(struct sHost));
	strcpy(robots_txt->Page, "/robots.txt");
	robots_txt->level = 1;
	robots_txt->type  = 1;
    robots_txt->host_id = currentHst.host_id;

	/* lstInit() receives the struct by value, so the temporary can be released right after */
	lstFirst = lstInit(*robots_txt);
	
	FREE(robots_txt);
	
	/* second list entry: the start page itself */
	currentHst.level = 1;
	lstAddHost(&lstFirst,currentHst);
	
	/* remember which host is being indexed */
	memcpy(&IndexingHost,&currentHst,sizeof(struct sHost));
	
	/* optional filter module: a return of 0 from its init handler skips this host */
	{
		int (*modInitFilter)(char*, char*);
		char sError[MAXDESCRIPTIONSIZE];   /* receives the handler's error text */
		int ret;
		
		if( ( modInitFilter = GetInitModFunctionHandlerByName("modFilter")) )
		{	
			ret=modInitFilter(currentHst.Host,sError);
			if(ret==0)
			{
				FREE(sqlQuery);
				printf("\nmodInitFilter(): %s\n\n",sError);
				ERROR_LOG(sError);
				return 0;
			}
		}
	}
	
	SetConsoleTitle("...");

    /* per-host overrides from hostlist_extras (fields stay 0 when there is none) */
    setHostExtras( currentHst.host_id );

    /* show the limits in effect: the per-host value when set, else the global one */
    printf(" --- This site Limits ---\r\n");
    printf("Max pages         : %i\r\n", (EXTRA_LIMITS.nMaxPagesPerSite==0) ? CRAWLER_LIMITS.nMaxPagesPerSite : EXTRA_LIMITS.nMaxPagesPerSite);
    printf("Max depth level   : %i\r\n", (EXTRA_LIMITS.nMaxDepthLevel==0) ? CRAWLER_LIMITS.nMaxDepthLevel : EXTRA_LIMITS.nMaxDepthLevel);
    printf("Max seconds       : %i\r\n", (EXTRA_LIMITS.nMaxSecondsPerSite==0) ? CRAWLER_LIMITS.nMaxSecondsPerSite : EXTRA_LIMITS.nMaxSecondsPerSite);
    printf("Max bytes         : %i\r\n", (EXTRA_LIMITS.nMaxBytesPerSite==0) ? CRAWLER_LIMITS.nMaxBytesPerSite : EXTRA_LIMITS.nMaxBytesPerSite);
    printf(" -----------------------\r\n");


	/* full re-index (bUpdate == 0): drop what is stored for this host first */
	if(bUpdate==0)
	{
		printf("Deleting old index for %s...",currentHst.Host);
		fflush(stdout);
		
		/* index rows first (they are matched through pagelist), then the pages themselves */
		sprintf(sqlQuery,"DELETE ii FROM pagelist, ii WHERE pagelist.hostname =\'%s\' AND ii.pageid = pagelist.id ",currentHst.Host);
		my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);

        sprintf(sqlQuery,"DELETE FROM pagelist WHERE hostname =\'%s\' ",currentHst.Host);
		my_mysql_query(&gMysqlDB2, sqlQuery,NO_BLOCK);
		
		printf("OK\r\n");
		
		printf("Deleting old rels for %s...",currentHst.Host);
		
		sprintf(sqlQuery,"DELETE FROM rels WHERE host_id = %d",currentHst.host_id);
		my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);
		
		printf("OK\r\n");
		
	}
	
    /* set startTimeMS before creating threads */
    startTimeMS=GetTickCount();

	SetConsoleTitle("Creating threads...");
	
	CreateThreads();
	/**************************MT********************************/
	
	printf("\r\n");
	
	/* record the wall-clock start time as h:m:s for printStats() */
	time( &long_time ); 
	newtime=localtime(&long_time);
	
	sprintf(startTime,"%i:%i:%i",newtime->tm_hour ,newtime->tm_min ,newtime->tm_sec );
	
	/* supervision loop: one pass roughly every 300 ms */
	while(condition)
	{
#ifdef WIN32
		sprintf(strTitle,"OpenWebSpiderV%s | Pages: %i | Time: %i sec | host: %s",VERSION,nPagesViewed,(int)((GetTickCount()-startTimeMS)/1000),currentHst.Host);
		SetConsoleTitle(strTitle);
#endif
		CheckThreads();
		
		Sleep(300);
		
		/* shutdown requested: stop the workers, store the totals and leave the program */
		if(iQuit==1)
		{
			printf("\r\n\r\nQuitting: Killing threads...\n\n");
			
			KillThreads();
			
			iQuit=0;
			bKillThread=0;
			
			sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%d,time_sec=%d,bytes_downloaded=%d, error_pages=%d WHERE hostname = \'%s\' limit 1" ,nPagesViewed, (int)((GetTickCount()-startTimeMS)/1000), bytesDownloaded, nErrorPages ,currentHst.Host);
			
			printStats(&currentHst,0);
			
			my_mysql_ping(&gMysqlDB1,NO_BLOCK);
			my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);

			/* released here: DoQuit() below exits, so the FREE at the end of the function is not reached */
			FREE(sqlQuery);
			
			FlushTempTable(gTable);
			
			if( bBuildOwsOwnIndex == 1 )
            {
			    /* all pages are swapped to the table pagelist */
			    /* are we using ows own index? */
			    /* if so: build the index for the current hostname */
			    BuildOwsOwnIndex(&currentHst, 1);
            }
	
			CalcPageRank( currentHst );
			
			/* terminates the process */
			DoQuit();
			
		}/*if(iQuit==1)*/
		
		/* thread restart requested: kill the workers and start fresh ones */
		/* NOTE(review): bKillThread is not cleared in this branch - presumably reset elsewhere, confirm */
		if(bKillThread==1)
		{
			SetConsoleTitle("Killing threads");
			KillThreads();
			CreateThreads();
		}/*if(bKillThread==1)*/
		
		
		/* presumably BLOCKTHRDHST is the lock that guards the URL list - confirm */
		thrdBlock(BLOCKTHRDHST);
		/* host finished when no list node has value 0 or 2 (presumably "to be indexed" / "in indexing") */
		if(/*iDoNextHost==1 ||*/						/*Switching to the next host*/
			(lstGetNodeByVal(lstFirst,0)==NULL &&
			lstGetNodeByVal(lstFirst,2)==NULL))
		{
			/* set the status of the pages to be indexed and of that in indexing as indexed */
			/*lstSetNodeStatus(lstFirst,0,1);
			lstSetNodeStatus(lstFirst,2,1);*/
			
			/* release the lock before the workers are stopped */
			thrdUnBlock(BLOCKTHRDHST);
			
			SetConsoleTitle("Killing threads");
			
			bKillThread=1;
			
			KillThreads();
			
			/* the totals are written to hostlist only when iDoNextHost is set */
			if(iDoNextHost==1)
			{
				//sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%i WHERE hostname =\'%s\' limit 1",nPagesViewed,currentHst.Host);
				sprintf(sqlQuery,"UPDATE hostlist SET status = 1,indexed_pages=%d,time_sec=%d,bytes_downloaded=%d, error_pages=%d WHERE hostname = \'%s\' limit 1" ,nPagesViewed, (int)((GetTickCount()-startTimeMS)/1000), bytesDownloaded, nErrorPages ,currentHst.Host);
				
				my_mysql_ping(&gMysqlDB1,NO_BLOCK);
				my_mysql_query(&gMysqlDB1, sqlQuery,NO_BLOCK);
				
				iDoNextHost=0;
				
			}
			
			FlushTempTable(gTable);
			
            if( bBuildOwsOwnIndex == 1 )
            {
			    /* all pages are swapped to the table pagelist */
			    /* are we using ows own index? */
			    /* if so: build the index for the current hostname */
			    BuildOwsOwnIndex(&currentHst, 1);
            }
			
			CalcPageRank(currentHst);
			
            /* this host has been indexed! Proceed to the next? */
			break;			
			
		}//if(iDoNextHost==1 || (lstGetNodeByVal(lstFirst,0)==NULL && lstGetNodeByVal(lstFirst,2)==NULL))
		
		/* still work left: release the lock and poll again */
		thrdUnBlock(BLOCKTHRDHST);
		
	}/*while(condition)*/

	FREE(sqlQuery);	
	return 1;
}

/*
 * Tell whether the host being crawled has hit one of its limits.
 *
 * For pages, seconds and bytes a per-host value in EXTRA_LIMITS (> 0)
 * takes precedence; the global CRAWLER_LIMITS value is consulted only
 * while the per-host value is exactly 0. A limit that is not > 0 is
 * treated as "no limit". The error-page limit exists only globally.
 *
 * Returns 1 when a limit has been reached, 0 otherwise.
 */
int checkLimits()
{
	/* global limits, used while no per-host override is set */

	/* number of pages indexed */
	if(EXTRA_LIMITS.nMaxPagesPerSite == 0 && CRAWLER_LIMITS.nMaxPagesPerSite>0 && nPagesViewed >= CRAWLER_LIMITS.nMaxPagesPerSite)
		return 1;

	/* number of seconds */
	if(EXTRA_LIMITS.nMaxSecondsPerSite == 0 && CRAWLER_LIMITS.nMaxSecondsPerSite>0 && (int)((GetTickCount()-startTimeMS)/1000) >= CRAWLER_LIMITS.nMaxSecondsPerSite)
		return 1;

	/* number of bytes downloaded */
	if(EXTRA_LIMITS.nMaxBytesPerSite == 0 && CRAWLER_LIMITS.nMaxBytesPerSite>0 && bytesDownloaded >= CRAWLER_LIMITS.nMaxBytesPerSite)
		return 1;

	/* per-host limits */

	/* number of pages indexed */
	if(EXTRA_LIMITS.nMaxPagesPerSite>0 && nPagesViewed >= EXTRA_LIMITS.nMaxPagesPerSite)
		return 1;

	/* number of seconds */
	if(EXTRA_LIMITS.nMaxSecondsPerSite>0 && (int)((GetTickCount()-startTimeMS)/1000) >= EXTRA_LIMITS.nMaxSecondsPerSite)
		return 1;

	/* number of bytes downloaded */
	if(EXTRA_LIMITS.nMaxBytesPerSite>0 && bytesDownloaded >= EXTRA_LIMITS.nMaxBytesPerSite)
		return 1;

	/* number of error pages (global only) */
	if(CRAWLER_LIMITS.nMaxErrorPerSite>0 && nErrorPages >= CRAWLER_LIMITS.nMaxErrorPerSite)
		return 1;

	return 0;
}

/*
 * Print the statistics of the current crawl to stdout and append them to
 * "stats.log" (the log part is skipped silently if the file cannot be
 * opened).
 *
 * Host: host the figures belong to (only Host->Host is read).
 * flag: selects the heading
 *         0 -> complete stats
 *         1 -> in-complete stats
 *         2 -> switched to the next host
 *
 * NOTE(review): for flag 2 the console heading is "STATS(2)" while the
 * log heading is "STATS(S)"; kept as found.
 */
void printStats(struct sHost* Host,int flag)
{
	time_t now;
	struct tm *local;
	FILE* logFile;

	time( &now );
	local = localtime(&now);

	/* ---- console ---- */
	switch(flag)
	{
	case 1:
		printf("\r\n + STATS(*)\r\n");
		break;
	case 2:
		printf("\r\n + STATS(2)\r\n");
		break;
	default:
		printf("\r\n + STATS\r\n");
		break;
	}

	printf("  - Host:\t\t%s\r\n",Host->Host );
	printf("  - Pages:\t\t%i\r\n",nPagesViewed);
	printf("  - Downloaded:\t\t%i Kb\r\n",(int)bytesDownloaded/1024);
	printf("  - Scan time: %is (%s - %i:%i:%i)\r\n\r\n",(int)((GetTickCount()-startTimeMS)/1000),startTime,local->tm_hour ,local->tm_min ,local->tm_sec  );

	/* ---- stats.log ---- */
	logFile = fopen("stats.log","a");
	if(logFile == NULL)
		return;

	switch(flag)
	{
	case 1:
		fprintf(logFile," + STATS(*)\r\n");
		break;
	case 2:
		fprintf(logFile," + STATS(S)\r\n");
		break;
	default:
		fprintf(logFile," + STATS\r\n");
		break;
	}

	fprintf(logFile,"  - %i\\%i\\%i %i:%i:%i -- OpenWebSpider version: %s --\r\n",local->tm_mday ,local->tm_mon +1, local->tm_year +1900,local->tm_hour ,local->tm_min ,local->tm_sec,VERSION);
	fprintf(logFile,"  - Host:\t\t\t%s\r\n",Host->Host );
	fprintf(logFile,"  - Pages:\t\t%i\r\n",nPagesViewed);
	fprintf(logFile,"  - Downloaded:\t\t%i Kb\r\n",(int)bytesDownloaded/1024);
	fprintf(logFile,"  - Scan time: %is (%s - %i:%i:%i) \r\n",(int)((GetTickCount()-startTimeMS)/1000),startTime,local->tm_hour ,local->tm_min ,local->tm_sec);
	fprintf(logFile,"============================================================\r\n\r\n");
	fclose(logFile);
}

/*
 * Fatal-error handler used when an allocation fails.
 *
 * Prints a short message, plus the name of the reporting function when
 * funct is not NULL, and terminates the process. Does not return.
 *
 * The process exits with EXIT_FAILURE so that shells and supervisors can
 * tell this abort apart from a normal shutdown.
 */
void MemoryCorruptedHandler(char* funct)
{
	printf("\r\n\r\nMemory corrupted\r\n");

	if(funct != NULL)
		printf("Function: %s\r\n",funct);

	printf("Exiting...\r\n\r\n");

	exit(EXIT_FAILURE);
}

/*
 * Orderly shutdown of the crawler.
 *
 * Closes the server socket (only when the built-in server was enabled),
 * clears the quit/kill flags, drops the temporary table, closes both
 * MySQL connections and exits the process with status 0.
 * Does not return.
 */
void DoQuit()
{
	if(actAsAServerPort != 0)
	{
		printf("\n\nFreeing Sockets...");
		closesocket(OWS_Server_fd);
		printf("OK\n\n");
	}

	iQuit = 0;
	bKillThread = 0;

	/* the temp table must go while the connections are still open */
	DropTempTable(gTable);

	mysql_close(&gMysqlDB1);
	mysql_close(&gMysqlDB2);

	Sleep(200);

	printf("Bye\n\n");
	SetConsoleTitle("Bye byE");

	exit(0);
}


#endif

/*EOF*/
