/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.7
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef __THREAD
#define __THREAD


/* Thread bookkeeping shared by the functions in this file.
 * thrdML[] has one slot per crawler thread and is filled in by CreateThreads();
 * the server thread started by CreateServerThread() has its own single slot. */
#ifdef WIN32
  /* crawler thread identifiers, written by _beginthreadex() */
  unsigned thrdML[MAXTHREAD];
  /* crawler thread handles returned by _beginthreadex(); waited on and closed in KillThreads() */
  HANDLE thrdhML[MAXTHREAD];

  /* identifier and handle of the server thread */
  unsigned thrdServer;
  HANDLE thrdhServer;
#else
  /* pthread ids of the crawler threads; joined in KillThreads() */
  pthread_t thrdML[MAXTHREAD];
  /* pthread id of the server thread */
  pthread_t thrdServer;
#endif

/*
 * mainThread: body of one crawler worker thread.
 *
 * pthrdNum carries the worker index (CreateThreads() passes the loop counter
 * through the thread-argument pointer). The index selects this worker's slot
 * in thrdStatus[] and prefixes its console output.
 *
 * Each loop iteration:
 *   1. calls UnBlockAll() and leaves the thread if iQuit, bKillThread or
 *      bKillThreadReserved is set; idles while iStop is set;
 *   2. under BLOCKTHRDHST, asks lstGetNodeByVal(lstFirst,0) for the next
 *      node, sets its "viewed" status to 2 and copies the host record into
 *      currentHst;
 *   3. when a crawl delay is configured, takes BLOCKEXCRAWL and sleeps before
 *      the download; the mutex is released once the response is received;
 *   4. connects, sends the request built by ForgeHTTPPacket(), receives the
 *      response and hands it to ParseHTTPRequest();
 *   5. feeds robots.txt to ParseRobotsTxt(), or looks for URLs and indexes
 *      the page;
 *   6. sets the node's "viewed" status to 1.
 *
 * "viewed" status values as used in this file: 2 = being processed,
 * 1 = done (KillThreads() describes them as "indexing" and "indexed").
 *
 * condition is never cleared, so the loop is only left through the
 * ExitThread() calls.
 */
#ifdef WIN32
  unsigned __stdcall 
#else
  void* 
#endif
mainThread(LPVOID pthrdNum)
{
struct sHost currentHst;
char         packet[MAXPACKETSIZE];
char         html[MAXPACKETSIZE];
DWORD        tStart=0;
SOCKET       sock;
int          snd;
int          maxbytes2recive;
int          recvdbytes;
int          condition=1;
int          thrdNum=(int)pthrdNum;
char         sStdOutTmp[10000];
char         sStdOut[10000];
SOCKADDR_IN  mSaddr;
NODE*        nCur=NULL;
char         httpStatus[MAXHTTPSTATUSSIZE];
int          HttpRequestRet;
char         sLocation[MAXURLSIZE];

	while(condition)
	{
		/* Drop whatever the previous iteration may still hold; many error
		 * paths below "continue" without unlocking and rely on this call. */
		UnBlockAll();

		if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1)
		{
			UnBlockAll();
			ExitThread(0);
		}

		/* crawler paused: poll again in half a second */
		if(iStop)
		{
			Sleep(500);
			continue;
		}

		thrdBlock(BLOCKTHRDHST);

		if( checkLimits() == 1
			|| iDoNextHost==1)	/* switch to the next host */
		{
			/* set the status of the pages still to be indexed as indexed */
			lstSetNodeStatus(lstFirst,0,1);

			/* un-block all mutexes owned by this thread (only BLOCKTHRDHST) and... */
			UnBlockAll();

			/* ...exit */
			ExitThread(0);
		}


		if((nCur=lstGetNodeByVal(lstFirst,0))!=NULL)
		{
			if(nCur==NULL || nCur->field==NULL)
			{
				thrdUnBlock(BLOCKTHRDHST);
				continue;
			}

			/* has robots.txt been handled yet? */
			if(bRobotsOK==0)
			{
				/* only robots.txt itself may be fetched until then */
				if(!(stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0))
				{
					/* any other page: wait for robots.txt */
					thrdUnBlock(BLOCKTHRDHST);
					Sleep(1000);
					continue;
				}
			}

			/* CheckRobotExclusion() returning 0: mark the page done without fetching it */
			if(CheckRobotExclusion(((struct sHost*)nCur->field)->Page)==0)
			{
				((struct sHost*)nCur->field)->viewed = 1;
				thrdUnBlock(BLOCKTHRDHST);
				bRobotsOK=1;
				continue; 
			}

			/* claim the node (status 2) and work on a private copy of the record */
			((struct sHost*)nCur->field)->viewed = 2;
			memcpy(&currentHst,((struct sHost*)nCur->field),sizeof(struct sHost));
		}
		else
		{
			/* nothing to fetch right now */
			thrdUnBlock(BLOCKTHRDHST);
			Sleep(1000);
			continue;
		}

		/* heartbeat read by CheckThreads() */
		thrdStatus[thrdNum]=GetTickCount();

		if(currentHst.type == 3)  /* current url is not a html page or a plain text file */
		{
			if(nCur==NULL || nCur->field==NULL)
				continue;

			((struct sHost*)nCur->field)->viewed = 1;
			thrdUnBlock(BLOCKTHRDHST);
			bRobotsOK=1;
			continue;
		}


		/* (1 - Crawl Delay) lock the mutex so downloads are serialised */
		if(iRobCrawlDelay>0 || iCrawlDelay>0)
		{
			thrdBlock(BLOCKEXCRAWL);

			/* the wait for the mutex may have been long: re-check the exit flags */
			if(iQuit==1 || bKillThread==1 || bKillThreadReserved==1)
			{
				UnBlockAll();
				ExitThread(0);
			}

			/* iRobCrawlDelay is scaled by 1000, iCrawlDelay is passed as is
			 * (presumably seconds vs. milliseconds - TODO confirm) */
			Sleep(  (iRobCrawlDelay>0) ? iRobCrawlDelay*1000 : iCrawlDelay );			
		}


		thrdUnBlock(BLOCKTHRDHST);

		tStart= GetTickCount();

		if(!LoadSocket(&sock,&currentHst,&mSaddr))
		{
			/* NOTE(review): sock is closed although LoadSocket() failed -
			 * confirm LoadSocket() always initialises it */
			closesocket(sock);
			fprintf(stderr,"\r\n(%i) Socket(%s) error\r\n\r\n",thrdNum, currentHst.Host);

			((struct sHost*)nCur->field)->viewed = 1;
			bRobotsOK=1;
			
			continue;
		}

		if (connect(sock, (LPSOCKADDR) &mSaddr, sizeof(mSaddr)) == SOCKET_ERROR)
		{
			fprintf(stderr,"\r\n(%i) Connect(%s) error\r\n\r\n",thrdNum,currentHst.Host);
			closesocket(sock);

			((struct sHost*)nCur->field)->viewed = 1;
			bRobotsOK=1;

			continue;
		}

		memset(packet,0,MAXPACKETSIZE);
		memset(html,0,MAXPACKETSIZE);
		sStdOut[0]=0;

		ForgeHTTPPacket(currentHst,packet);

		snd=SEND(sock,packet);

		/* Console output for this page is accumulated in sStdOut and printed
		 * in one piece; the port is shown only when it differs from PORT. */
		if(currentHst.port!=PORT)
				sprintf(sStdOut,"(%i) Current -> http://%s:%i%s (%s)",thrdNum,currentHst.Host,currentHst.port, currentHst.Page,currentHst.Description);
		else
				sprintf(sStdOut,"(%i) Current -> http://%s%s (%s)",thrdNum,currentHst.Host,currentHst.Page,currentHst.Description);

		if(snd<10)
		{
			closesocket(sock);
			strcat(sStdOut,"\t\t[SEND ERROR]\n\n");
			printf("%s",sStdOut);

			((struct sHost*)nCur->field)->viewed = 1;
			bRobotsOK=1;

			continue;
		}


		maxbytes2recive=sizeof(packet);	/* set the bytes to receive */

		recvdbytes=RecvPackets(&sock,packet,maxbytes2recive);
		closesocket(sock);

		/* (2 - Crawl Delay) file received: unlock the mutex */
		if(iRobCrawlDelay>0 || iCrawlDelay>0)
		{
			thrdUnBlock(BLOCKEXCRAWL);
		}


		if(recvdbytes<=10)
		{
			((struct sHost*)nCur->field)->viewed = 1;
			strcat(sStdOut,"\t\t[RECV ERROR]\n\n");
			printf("%s",sStdOut);
			bRobotsOK=1;

			continue;
		}

		bytesDownloaded+=recvdbytes;

		/* a zero return from ParseHTTPRequest() means the page is dropped */
		if((HttpRequestRet=ParseHTTPRequest(packet,html,recvdbytes,httpStatus,sLocation,currentHst.level))!=0)
		{
			/* 302 responses also report the Location target */
			if(strnicmp(httpStatus,"HTTP/1.1 302",12)==0 || strnicmp(httpStatus,"HTTP/1.0 302",12)==0)
				sprintf(sStdOutTmp,"\n - HTTP header: %s\n - Location: %s\n - Downloaded %i Kb (%i bytes) in %i ms\n" ,httpStatus,sLocation,recvdbytes/1024,recvdbytes,(int)(GetTickCount()-tStart));
			else
				sprintf(sStdOutTmp,"\n - HTTP header: %s\n - Downloaded %i Kb (%i bytes) in %i ms\n" ,httpStatus,recvdbytes/1024,recvdbytes,(int)(GetTickCount()-tStart));

			strcat(sStdOut,sStdOutTmp);

			/* count 4xx and 5xx answers as error pages */
			if(strnicmp(httpStatus,"HTTP/1.1 4",10)==0 || strnicmp(httpStatus,"HTTP/1.0 4",10)==0 || strnicmp(httpStatus,"HTTP/1.1 5",10)==0 || strnicmp(httpStatus,"HTTP/1.0 5",10)==0)
			{
				nErrorPages++;
			}

		}
		else
		{
			((struct sHost*)nCur->field)->viewed = 1;
			strcat(sStdOut,"\r\n");
			printf("%s",sStdOut);
			bRobotsOK=1;

			continue;
		}
		
		/* the page just fetched is the pending robots.txt */
		if(bRobotsOK==0 && stricmp(((struct sHost*)nCur->field)->Page,"/robots.txt")==0)
		{
			printf("%s",sStdOut);

			/* HttpRequestRet==2 is the "200 OK" case (see the indexing branch below) */
			if(HttpRequestRet==2)
				ParseRobotsTxt(html,currentHst);
			else
			{
				printf(" - Nothing to do with robots.txt\n\n");

			}

			((struct sHost*)nCur->field)->viewed = 1;
			bRobotsOK=1;

			continue;
		}
		else
			bRobotsOK=1;
		
		
		/* Index only HTML(1), plain text files(2) and custom handled files(4) */
		if(currentHst.type <= 2 || currentHst.type == 4)
		{

			/* Check the number of pages indexed or if we are switching to the next host */
			if( checkLimits() == 1
				|| iDoNextHost==1)	/* switch to the next host */
			{
				((struct sHost*)nCur->field)->viewed = 1;
				continue;
			}
			
			nPagesViewed++;

			if(currentHst.type == 1)					/* look for urls only in html pages */
			{
				int nUrlsFound;

				/* Run the scan BEFORE reading the clock: the order in which
				 * function arguments are evaluated is unspecified in C, so
				 * doing both inside the sprintf() call could time nothing. */
				tStart=GetTickCount();
				nUrlsFound=LookForUrls(html,currentHst);
				sprintf(sStdOutTmp," - Checked in %i ms (%i URL found)\n",(int)(GetTickCount()-tStart),nUrlsFound);
				strcat(sStdOut,sStdOutTmp);
			}

			tStart=GetTickCount();

			if(HttpRequestRet==2)	/* index only 200 OK */
			{
				if(bUseRegularExpressionA==1)	/* are we using a regular expression filter? */
				{	/* yes */
					if(regexec(&regexPageFilter, currentHst.Page, 0, 0, 0) == 0)
					{	/* match...index */
						tStart=GetTickCount();
						
						if(IndexPage(html,currentHst, recvdbytes)==1)
							sprintf(sStdOutTmp," - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart));
						else
							sprintf(sStdOutTmp,"\n");

						strcat(sStdOut,sStdOutTmp);
					}
					else
					{	/* discard */
						sprintf(sStdOutTmp,"\n");
						strcat(sStdOut,sStdOutTmp);
					}
				
				}
				else
				{	/* no filter: index */

					tStart=GetTickCount();
					if(IndexPage(html,currentHst, recvdbytes)==1)
						sprintf(sStdOutTmp," - Indexed in %i ms\n\n",(int)(GetTickCount()-tStart));
					else
						sprintf(sStdOutTmp,"\n");

					strcat(sStdOut,sStdOutTmp);
				}
			}

			printf("%s",sStdOut);

		}

		/* page fully processed */
		((struct sHost*)nCur->field)->viewed = 1;


	}/*while(condition)*/

return 0;
}

/*
 * KillThreads: wait for the crawler threads to end and reset the shared
 * crawl state.
 *
 * For each of the nThread slots:
 *   - Win32: waits up to 50 seconds on the thread handle, terminates the
 *     thread only if that wait did not report it as finished, then closes
 *     the handle;
 *   - POSIX: joins the thread.
 * The slot is cleared afterwards so that a second call does not join or
 * close the same thread again.
 *
 * Then the mutexes are re-initialised, nodes left in status 2 are set to 1,
 * bRobotsOK is forced to 1 and bKillThread is cleared.
 */
void KillThreads()
{
int i;

	printf("Killing Threads...\r\n\r\n");

	for(i=0;i<nThread;i++)
	{
#ifdef WIN32
		/* a NULL handle means the thread was never created or is already closed */
		if(thrdhML[i]!=NULL)
		{
			/* force the thread down only if it did not end by itself in time */
			if(WaitForSingleObject(thrdhML[i],50000)!=WAIT_OBJECT_0)
				TerminateThread(thrdhML[i],0);

			CloseHandle(thrdhML[i]);
			thrdhML[i]=NULL;
		}
#else
		if(thrdML[i]!=0)
		{
			pthread_join(thrdML[i],NULL);
			/* joining the same thread twice is undefined: forget the id */
			thrdML[i]=0;
		}
#endif
	}


	init_mutex();

	printf("Threads killed\r\n\r\n");

	/* set all nodes with status==2 (indexing) to status=1 (indexed) */
	lstSetNodeStatus(lstFirst, 2, 1);

	/* If the download of robots.txt timed out, the line above marks the file
	 * as indexed while bRobotsOK is still 0 (un-parsed), which would block
	 * the spider; so it is set to 1 (parsed) by hand.
	 */
	bRobotsOK = 1;
	
	/* debug */
	//lstDebugNodes(lstFirst,0);
	//lstDebugNodes(lstFirst,2);
	//lstDebugNodes(lstFirst,1);

	bKillThread=0;

}

/*
 * CreateThreads: start nThread crawler threads running mainThread().
 *
 * The mutexes are (re)initialised first. Each thread receives its index
 * through the thread-argument pointer, and its thrdStatus[] slot is stamped
 * with the current tick count. Progress is printed on a single console line.
 *
 * A thread that cannot be created is fatal on both platforms: the error is
 * printed and the process exits.
 */
void CreateThreads()
{
int i;
#ifndef WIN32
int errorCode;
#endif

	init_mutex();

	printf("\r\n");

	for(i=0;i<nThread;i++)
	{
		printf("\rCreating thread %i of %i     ",i+1,nThread);
		fflush(stdout);

#ifdef WIN32
		thrdhML[i] = (HANDLE)_beginthreadex(NULL,0,mainThread,(void*)i,0,&thrdML[i]);
		/* _beginthreadex() returns 0 on failure; treat it like the pthread branch does */
		if(thrdhML[i]==NULL)
		{
			printf("\r\nThread error:\r\n");
			perror(" -    _beginthreadex() ");
			exit(0);
		}
#else
		if( (errorCode=pthread_create(&thrdML[i], NULL, mainThread, (void*)i)) != 0 )
		{
			/* NOTE(review): pthread_create() reports its error through the
			 * return value printed here; the perror() text may be unrelated */
			printf("\r\nThread error (%i):\r\n",errorCode);
			perror(" -    pthread_create() ");
			exit(0);
		}
#endif
		thrdStatus[i]=GetTickCount();
	}
	printf("\r\n");
return;
}

/*
 * CreateServerThread: start the thread that runs StartOWSServer().
 *
 * port is handed to the thread through the thread-argument pointer. The
 * thread id (and, on Win32, its handle) is stored in thrdServer /
 * thrdhServer. Failure to create the thread is fatal on both platforms:
 * the error is printed and the process exits.
 */
void CreateServerThread(int port)
{

#ifdef WIN32
	thrdhServer = (HANDLE)_beginthreadex(NULL,0,StartOWSServer,(void*)port,0,&thrdServer);
	/* _beginthreadex() returns 0 on failure; treat it like the pthread branch does */
	if(thrdhServer==NULL)
	{
		printf("\r\nThread error:\r\n");
		perror(" -    _beginthreadex() ");
		exit(0);
	}
#else
int errorCode;
	if( (errorCode=pthread_create(&thrdServer, NULL, StartOWSServer, (void*)port)) != 0 )
	{
		/* NOTE(review): pthread_create() reports its error through the
		 * return value printed here; the perror() text may be unrelated */
		printf("\r\nThread error (%i):\r\n",errorCode);
		perror(" -    pthread_create() ");
		exit(0);
	}
#endif

}

/*
 * CreateHandleConnectionThread: start a thread running HandleConnection()
 * for one connection.
 *
 * struct_connection is passed unchanged to the new thread. The thread is
 * not tracked afterwards: on Win32 its handle is closed right after
 * creation (the thread itself keeps running), on POSIX the pthread id is
 * dropped. Failure to create the thread is fatal on both platforms: the
 * error is printed and the process exits.
 */
void CreateHandleConnectionThread(struct sHandleConnection* struct_connection)
{

#ifdef WIN32
HANDLE hConnection;

	hConnection = (HANDLE)_beginthreadex(NULL,0,HandleConnection,(void*)struct_connection,0,NULL);
	if(hConnection==NULL)
	{
		printf("\r\nThread error:\r\n");
		perror(" -    _beginthreadex() ");
		exit(0);
	}

	/* the handle returned by _beginthreadex() is owned by the caller;
	 * nobody waits on this thread, so release it now instead of leaking
	 * one handle per connection */
	CloseHandle(hConnection);
#else
int errorCode;
pthread_t ptTmp;
	if( (errorCode=pthread_create(&ptTmp, NULL, HandleConnection, (void*)struct_connection)) != 0 )
	{
		printf("\r\nThread error (%i):\r\n",errorCode);
		perror(" -    pthread_create() ");
		exit(0);
	}
	/* NOTE(review): the thread is never joined here; confirm that
	 * HandleConnection() detaches itself, otherwise its resources are
	 * kept until process exit */
#endif

}


/*
 * CheckThreads: watchdog over the crawler threads.
 *
 * Unless a host switch is pending (iDoNextHost != 0), averages over all
 * nThread workers the ticks elapsed since each one last stamped its
 * thrdStatus[] slot. When the average exceeds AVGTHREADDELAY, a notice is
 * logged and bKillThread is set to 1, which the workers check at the top
 * of their loop.
 *
 * Does nothing when there are no worker threads.
 */
void CheckThreads()
{
DWORD curTickCount;
int i;
int avgSec;

	if(iDoNextHost==0)
	{
		/* no workers: nothing to average, and nThread==0 would divide by zero below */
		if(nThread<=0)
			return;

		avgSec=0;
		curTickCount=GetTickCount();
		for(i=0;i<nThread;i++)	/* check the status of the threads */
		{
			/* a stamp newer than (or equal to) our reading counts as one tick;
			 * this also keeps the unsigned subtraction from wrapping */
			if(curTickCount>thrdStatus[i])
				avgSec+=(curTickCount-thrdStatus[i]);
			else
				avgSec++;

		}/* for(i=0;i<nThread;i++) */

		avgSec/=nThread;

		if(avgSec>AVGTHREADDELAY)
		{
			ERROR_LOG("Notice: Killing thrads avgSec>100000");
			bKillThread=1;
		}
	}/* if(iDoNextHost==0) */

return;
}

#endif

/*EOF*/

