/* OpenWebSpider
 *
 *  Author:     Stefano Alimonti aka Shen139
 *  Version:    0.6.1
 *  Mail:       shen139 [at] openwebspider (dot) org
 *  
 * 
 *  Compile with
 *  + Linux:  $ gcc openwebspider-0.6.c -o openwebspider `mysql_config --cflags --libs` -lpthread -ldl -rdynamic -Wall
 *   - mysql-devel needed
 *  + Windows: Microsoft Visual C++ 6.0
 *
 *
 * Web Site: http://www.openwebspider.org/
 * 
 *
 * FAQ about Robots and Search engine here: http://www.robotstxt.org/wc/faq.html
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *
 */

#define AUTHOR          Shen139
#define VERSION         "0.6.1"
#define DBVERSION       1

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <time.h>
#include <malloc.h>
#include <ctype.h>
#include <stdarg.h>
#include "regex.h"

#ifdef WIN32
  #include <process.h>
  #include <windows.h>
  #include "snprintf.c"

  #pragma comment(lib,"libmySQL.lib")
  /****************************************************************************/
  #include "mysql\mysql.h"
  /****************************************************************************/

#else /*linux*/
  #define _MULTI_THREADED
  #include <pthread.h>
  #include <sched.h>
  #include <sys/time.h>
  #include <unistd.h>
  #include <mysql/mysql.h>
  #include <pthread.h>
  #include <sys/types.h>
  #include <netinet/in.h>
  #include <netdb.h>
  #include <dlfcn.h>
  #include <string.h>
  #include <sys/socket.h>
  #include <arpa/inet.h>
#endif

#include "functions.h"

#include "mymutex.h"
#include "list.h"
#include "hstlist.h"
#include "htmlfnct.h"
#include "socket.h"
#include "sqlfnct.h"
#include "getopt.h"
#include "thread.h"
#include "misc.h"
#include "robots.h"
#include "rank.h"
#include "urlfunct.h"
#include "temptable.h"
#include "modules.h"
#include "sqlfnct.h"
#include "strfnct.h"
#include "server.h"
#include "search.h"
#include "parse_conf.h"
#include "indexer.h"

/*
 * Prints the command-line help on stdout, then the error message `txt`
 * on stderr, and terminates the process with exit status 0.
 *
 * txt: description of the argument error that triggered the help screen.
 *
 * The function never returns to its caller (it always calls exit()), even
 * though it is declared as returning int.
 */
int usage(char *txt)
{
	/* Help text printed before the crawl-delay line (which needs formatting). */
	static const char *const helpHead[] = {
		"\n\nUsage: openwebspider [Arguments]\r\n",
		"Arguments:\r\n",
		"-I [Search string] (*) (Search the word or the words(between \"double-quotes\") from the database)\r\n",
		" or\r\n",
		"-i [start url] (*) (Start indexing pages starting from passed url)\r\n",
		"-t [Number of threads] (Default: 20)\r\n",
		"-s (Single Host Mode)\r\n",
		"-m [Maximum level of depth in the tree of the pages] [Default: 0 (No limit)]\r\n",
		"-e (Doesn't Add External Host)\r\n",
		"-F (Free indexing mode)\r\n",
		"-l (Limits the maximum number of pages indexed per site) [Default: 0 (No limit)]\r\n",
		"-x (Saves a cache of the html page (full html)) (slow)\r\n",
		"-z (Saves a Compressed cache of the html page (full html)) (slow)\r\n",
		"-f [module] (Import loadable functions from the library)\r\n",
		"-X [eXtension(s)] (Set all the extensions that openwebspider must consider (eg. -X pdf,swf))\r\n",
		"-u (index only new pages (Update))\r\n",
		"-T (Testing Mode) No data (pages and rels) will be written into the DB\r\n",
		"-r [0-1-2](Saves relationships between pages (Default: 1))\r\n    0: doesn't save relationships\r\n    1: saves only relationships between hosts\r\n    2: saves all relationships (between hosts and pages)\r\n",
		"-n (No index pages) Don't index pages\r\n"
	};

	/* Help text printed after the crawl-delay line. */
	static const char *const helpTail[] = {
		"-S [TCP PORT] (Act as a server to get commands)\r\n",
		"-o [OpenWebSpider Own Index] (Build the OWS own index)\r\n",
/*		" or\r\n",
		"-c [hostname] Calculate Page Rank for a Host\r\n",
*/
		"--\r\n-p [path] (specify the full path of the configuration file (eg.: \"/etc/openwebspider/openwebspider.conf\"))\r\n",
		"\r\n(*) Arguments needed\r\n"
	};

	size_t idx;

	for(idx=0; idx<sizeof(helpHead)/sizeof(helpHead[0]); idx++)
		fputs(helpHead[idx],stdout);

	/* The upper bound is shown in milliseconds. */
	printf("-d [0-%i ms (Crawl Delay)] (Default: 0)\r\n",MAXCRAWLDELAY*1000);

	for(idx=0; idx<sizeof(helpTail)/sizeof(helpTail[0]); idx++)
		fputs(helpTail[idx],stdout);

	/* The reason for showing the help goes to stderr, after the help text. */
	fprintf(stderr,"\r\n\r\nERROR: %s\r\n\r\n",txt);

	exit(0);
}


/*
 * Signal handler.
 *
 * a: number of the signal that was caught.
 *
 * Always reports the signal number on stdout. For signal 15 (SIGTERM on
 * common platforms) it prints a farewell and terminates the process with
 * exit status 0; for every other signal it only sets the global iQuit
 * flag to 1 and returns.
 *
 * NOTE(review): printf() and exit() are not async-signal-safe; confirm this
 * is acceptable for the way the handler is installed.
 */
void sigdie(int a)
{
	printf("\r\n\r\nCaught signal n.%i\r\n\r\n",a);

	if(a!=15)
	{
		/* Any other signal: just raise the quit flag and return. */
		iQuit=1;
		return;
	}

	printf("\r\nExiting...\r\n");
	exit(0);
}

/*
 * Program entry point.
 *
 * Parses the command line, reads the configuration file and then runs the
 * selected scan mode:
 *   scan_mode 2 (-I): connects to the MYSQLSERVER2/DB2 database and runs
 *                     IndexedSearch() with the user query;
 *   scan_mode 1 (-i): prepends "http://" to the start URL when missing,
 *                     passes it through unencode() (presumably URL-decoding
 *                     - confirm), parses it with ParseUrl() and hands the
 *                     resulting host to InitIndexing().
 *
 * The getopt() option string declares no option as taking a value; every
 * case that needs one reads it from argv[optind] and advances optind itself.
 *
 * Returns the result of IndexedSearch()/InitIndexing(); 0 when
 * ReadConfFile() returns 0, when the DB connection fails or when memory
 * cannot be allocated; 1 if no scan mode branch is taken. Argument errors
 * do not return: usage() terminates the process.
 */
int main(int argc, char*argv[])
{
struct sHost currentHst;
char starturl[MAXURLSIZE], *starturlTmp;
int c;
extern int optind;
char sUserQuery[MAXUSERQUERYSIZE];
char sConfFilePath[MAXURLSIZE];

	printf("OpenWebSpider(v%s)\r\n  Coded by Shen139\r\n   shen139(at)eviltime(dot)com\r\n\r\n",VERSION);

	if(argc<3)
		usage("Too few arguments");

	/* The string buffers are zeroed so that the bounded strncpy() calls
	   below (which copy at most size-1 bytes) always leave them terminated. */
	memset(starturl,0,MAXURLSIZE);
	memset(sConfFilePath,0,MAXURLSIZE);
	memset(sUserQuery,0,MAXUSERQUERYSIZE);
	memset(iLastPing,0,sizeof(iLastPing));
	memset(lstRobotsExclusions,0,sizeof(lstRobotsExclusions));
	memset(&CustomExtensions,0,sizeof(CustomExtensions));
	bRobotsOK=0;

	/* Note: 'R' is accepted by the option string but has no case below,
	   so it ends up in the default branch. */
	while ((c = getopt(argc, argv, "IisrtmTelxRfXuzdFnSpo")) != -1)
	switch (c)
	{
		case 'I':				//indexed search
			if(scan_mode!=0)
				usage("(-I): Scan Mode redefinition");

			scan_mode=2;

			if(optind>=argc)
				usage("(-I): No enough arguments");

			if(strlen(argv[optind])>MAXUSERQUERYSIZE-1)
				usage("(-I): Query too long");
			else
			{
				strncpy(sUserQuery,argv[optind],MAXUSERQUERYSIZE-1);
				optind++;
			}

		break;
		case 'i':                   //Index pages
			if(scan_mode!=0)        //scan_mode is still 0 while no mode has been selected
				usage("(-i): Scan Mode redefinition");

			scan_mode=1;

			if(optind>=argc)
				usage("(-i): No enough arguments");

			if(strlen(argv[optind])>MAXURLSIZE-1)
				usage("(-i): Url too long");
			else
			{
				strncpy(starturl,argv[optind],MAXURLSIZE-1);
				optind++;
			}

		break;
		case 'f':                   //Load library
			if(optind>=argc)
				usage("(-f): No enough arguments");

			if(strlen(argv[optind])>MAXPAGESIZE-1)
				usage("(-f): File name too long");
			else
			{
				myLoadModules(argv[optind],modHandler);
				optind++;
			}

		break;
		case 's':
			starthostonly=1;
		break;
		case 'r':                   //relationships

			if(optind>=argc)
				usage("(-r): No enough arguments");

			/* Only the exact strings "0", "1" and "2" are accepted. */
			if(strcmp(argv[optind],"0")==0 || strcmp(argv[optind],"1")==0 || strcmp(argv[optind],"2")==0)
				nRelationships=atoi(argv[optind]);
			else
				usage("(-r): Range value 0,1,2");

			optind++;

		break;
		case 't':                   //n threads

			if(optind>=argc)
				usage("(-t): No enough arguments");

			nThread=atoi(argv[optind]);

			if(nThread>MAXTHREAD)
				usage("(-t): Too many threads");

			if(nThread<1)
				usage("(-t): At least one thread");

			optind++;

		break;
		case 'm':                    //maximum level of depth

			if(optind>=argc)
				usage("(-m): No enough arguments");

			maxDepthLevel=atoi(argv[optind]);

			if(maxDepthLevel<0)
				usage("(-m): Wrong level of depth");

			optind++;

		break;
		case 'l':					 //maximum pages per site
			if(optind>=argc)
				usage("(-l): No enough arguments");

			nMaxPagesPerSite=atoi(argv[optind]);

			if(nMaxPagesPerSite<0)
				usage("(-l): Wrong value for maximum number of pages per site");

			optind++;
		break;
		case 'x':					//Save HTML Cache
		case 'z':					//Same as -x, additionally compressed
			xCacheHtml=1;
			if(c=='z')
				xCacheHtmlCompressed=1;
		break;
		case 'S':                   //Act as a server

			if(optind>=argc)
				usage("(-S): No enough arguments");

			actAsAServerPort=atoi(argv[optind]);

			if(actAsAServerPort<1)
				usage("(-S): TCP PORT must be an integer");

			optind++;

		break;
		case 'p':                   //Path of openwebspider.conf
			if(optind>=argc)
				usage("(-p): No enough arguments");

			if(strlen(argv[optind])>MAXURLSIZE-1)
				usage("(-p): Path too long");
			else
			{
				strncpy(sConfFilePath,argv[optind],MAXURLSIZE-1);
				optind++;
			}

		break;
		case 'n':                    //Do not index pages
			bDontIndexPages=1;
		break;

		case 'T':                    //Test (doesn't write data to the DB)
			bTesting=1;
		break;
		case 'e':                    //Doesn't add external hosts
			bAddExternalHost=1;
		break;
		case 'u':                    //Update: index only new pages
			bUpdate=1;
		break;
		case 'F':                    //Free indexing mode
			bFreeIndexingMode=1;
		break;
   		case 'o':                    //OWS Own Index
			bBuildOwsOwnIndex=1;
    	break;
		case 'X':                   //Custom Extensions       (Under Construction)
			if(optind>=argc)
				usage("(-X): No enough arguments");

			if(strlen(argv[optind])>MAXCUSTOMEXTENSIONSIZE-1)
				usage("(-X): Custom extensions argument too long");
			else
			{
				/*split the comma separated extensions*/
			char * pExt;
			int nExt=0;

				/* strtok() modifies argv[optind] in place. */
				pExt = strtok (argv[optind],",");

				while (pExt != NULL)
				{
					/* Stop once MAXCUSTOMEXTENSIONS entries are stored.
					   NOTE(review): assumes CustomExtensions has at least
					   MAXCUSTOMEXTENSIONS rows - confirm against its declaration. */
					if(nExt>=MAXCUSTOMEXTENSIONS)
						break;

					/* Extensions that do not fit in one row are skipped. */
					if(strlen(pExt)<MAXEXTENSIONSIZE)
					{
						strcpy(CustomExtensions[nExt++],pExt);
					}
					pExt = strtok (NULL, ",");
				}

				optind++;
			}

		break;
		case 'd':                   //Crawl Delay

			if(optind>=argc)
				usage("(-d): No enough arguments");

			iCrawlDelay=atoi(argv[optind]);

			if(iCrawlDelay>MAXCRAWLDELAY*1000 || iCrawlDelay<0)
				usage("(-d): Wrong Crawl Delay");

			optind++;

		break;



/* OWS MACRO*//* case 'c':                    //Calculate Page Rank

			if(argc!=3)
				usage("(-c): Wrong number of arguments");

			if(strlen(argv[optind])>MAXHOSTSIZE-1)
				usage("(-c): Hostname too long");
			else
				strncpy(starturl,argv[optind],MAXHOSTSIZE-1);
			
			if(strnicmp(starturl,"http://",7)!=0)
			{
				starturlTmp=(char*)malloc(MAXHOSTSIZE);
				strncpy(starturlTmp,starturl,strlen(starturl)+7);
				sprintf(starturl, "http://%s",starturlTmp);
				FREE(starturlTmp);
			}

			if(ParseUrl(starturl,&currentHst,NULL)==-1)
				usage("(-c): Wrong start URL");

			printf("\r\nConnecting to Mysql server n.1 (%s)...",MYSQLSERVER1);			//Hosts
			if(sqlConnect(MYSQLSERVER1, USERDB1, PASSDB1, DB1,&gMysqlDB1)==0)
			{
				fprintf(stderr, "ERROR\r\nFailed to connect to database(%s): Error: %s\r\n",DB1,mysql_error(&gMysqlDB1));
				ERROR_LOG(mysql_error(&gMysqlDB1))
				return -1;
			}


			printf("\r\nConnecting to Mysql server n.2 (%s)...",MYSQLSERVER2);		//Pages
			if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2,&gMysqlDB2)==0)
			{
				fprintf(stderr, "ERROR\r\nFailed to connect to database(%s): Error: %s\r\n",DB2,mysql_error(&gMysqlDB2));

				ERROR_LOG(mysql_error(&gMysqlDB2))

				mysql_close(&gMysqlDB2);
				return -1;
			}

			init_mutex();

			CalcPageRank(currentHst.Host);

			mysql_close(&gMysqlDB1);

			return 1;
		break;
		*/

		default:
			usage("Unknown option argument");
	}

    /*
    CHECKs: building the OWS own index cannot be combined with
    -n (no index), -T (testing) or -u (update)
    */
    if( (bBuildOwsOwnIndex == 1)  &&
       ((bDontIndexPages == 1)    ||
        (bTesting == 1)           ||
        (bUpdate == 1)    )        )
            usage("Wrong mix of arguments");

    /*
    Parse Config File
    */
	if(ReadConfFile(sConfFilePath)==0)
		return 0;

	/***************************************************************/

	if(scan_mode==0)
		usage("Scan mode undefined");

	/*********************************/

	if(scan_mode==2)
	{
	MYSQL mysql;

		printf("Scan Mode:       \tIndexed\r\n");
		printf("Query:           \t%s\r\n",sUserQuery);
		printf("Surfing the DB...\r\n");

		if(sqlConnect(MYSQLSERVER2, USERDB2, PASSDB2, DB2 , &mysql, MYSQLSERVER_PORT2)==0)
		{
			fprintf(stderr, "Failed to connect to database: Error: %s\n",mysql_error(&mysql));

		return 0;
		}


		return( IndexedSearch(&mysql,sUserQuery) );
	}

	/***********************/

	if(scan_mode==1)
	{
		if(strnicmp(starturl,"http://",7)!=0)
		{
		size_t urlLen=strlen(starturl);

			/* "http://" + url + terminator must fit in starturl; without
			   this check a URL close to MAXURLSIZE would overflow it. */
			if(urlLen+7>MAXURLSIZE-1)
				usage("(-i): Url too long");

			/* Shift the URL (terminator included) right by 7 bytes and
			   put the scheme in front of it. */
			memmove(starturl+7,starturl,urlLen+1);
			memcpy(starturl,"http://",7);
		}

		starturlTmp=(char*)malloc(MAXURLSIZE);
		if(starturlTmp==NULL)
		{
			fprintf(stderr,"\r\nERROR: out of memory\r\n");
			return 0;
		}

		/* The end pointer is one past the terminator, so the terminator is
		   part of the processed range.
		   NOTE(review): assumes unencode() never produces more bytes than
		   it consumes, so the result fits in MAXURLSIZE - confirm. */
		unencode(starturl,starturl+strlen(starturl)+1,starturlTmp);
		strcpy(starturl,starturlTmp);

		FREE(starturlTmp);

		if(ParseUrl(starturl,&currentHst,NULL)==-1)
			usage("Wrong start URL");

		{
		size_t descLen=MIN(strlen(starturl),MAXDESCRIPTIONSIZE-1);

			/* strncpy() with a count not larger than the source length
			   writes no terminator, so add it explicitly.
			   NOTE(review): assumes Description holds at least
			   MAXDESCRIPTIONSIZE bytes, as the bound implies - confirm. */
			strncpy(currentHst.Description,starturl,descLen);
			currentHst.Description[descLen]='\0';
		}

		return InitIndexing(currentHst);
	}


return 1;
}

/*EOF*/
