/* OpenWebSpider
*
*  Author:     Stefano Alimonti aka Shen139
*  Mail:       shen139 [at] openwebspider (dot) org
*
*
* This file is part of OpenWebSpider
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
*/

#ifndef __INDEXER
#define __INDEXER


/* DEFAULT: MySQL FULL-TEXT Index */

/* IndexPage
*  Stores one fetched document in the MySQL FULL-TEXT table (gTable) and,
*  when xCacheHtml is 1, also saves the escaped raw html in its htmlcache column.
*
*  html        NUL-terminated document body (handed to RemoveShit for plain
*              text documents, host.type==2)
*  host        descriptor of the document; type, Host, Page, Description and
*              level are read here
*  htmlLength  length of html, only handed over to the "modFilter" module
*
*  returns  1  indexing is disabled (bTesting / bDontIndexPages) or the page
*              went through the INSERT step (a failed INSERT followed by a
*              successful reconnect still returns 1)
*           0  the page was skipped: already indexed while updating,
*              unsupported host.type, or rejected by a filter
*          -1  the INSERT failed and the reconnect failed too (iQuit is set to 1)
*/
int IndexPage(char* html, struct sHost host, unsigned int htmlLength)
{
	char *cTmp=NULL;
	char *pureText=NULL;
	char tmpTitle[MAXDESCRIPTIONSIZE], title[MAXDESCRIPTIONSIZE];
	char *sanHostname=NULL;
	char *sanPage=NULL;
	char *sqlQuery=NULL;
	char *htmlcache=NULL;
	char *cacheQuery=NULL;
	size_t hostLen, pageLen, cacheQuerySize;
	unsigned int textLength;
	int usetitle=0;
	int written;
	int ret=0;

	int (*modFilter)(struct functArg*);

	if( bTesting==1 || bDontIndexPages==1)
		return 1;

	if( bUpdate==1 )	//-u ?
	{
		if( IsPageIndexed(&host)==1 )	//Is this page Indexed ?
			return 0;		//Yes, don't re-index
	}

	cTmp = (char*)malloc(MAXPACKETSIZE);
	pureText = (char*)malloc(MAXPACKETSIZE);

	if(cTmp==NULL || pureText==NULL)
		MemoryCorruptedHandler("IndexPage");

	// are we saving a cache? yes: prepare it now, before html is handed
	// to the text extraction below
	if(xCacheHtml==1)
	{
		size_t rawLen = strlen(html);

		// worst case every byte is escaped: 2*len+1 bytes are enough
		htmlcache=(char*)malloc( ( rawLen + 1 ) * 2 );
		if(htmlcache==NULL)
			MemoryCorruptedHandler("IndexPage");

		mysql_real_escape_string(&gMysqlDB3, htmlcache, html, rawLen);
	}

	if(host.type==1)    //HTML
	{
		if(BetweenTag(html, "title",tmpTitle ,1,MAXDESCRIPTIONSIZE)>0)
		{
			memset(title,0,MAXDESCRIPTIONSIZE);
			snprintf(title,MAXDESCRIPTIONSIZE-1,"%s",tmpTitle+1);
			usetitle=1;
		}

		textLength=UnHtml(html,cTmp,MAXPACKETSIZE);

		/* if sqlTextToUTF8 doesn't complete its work (for example for text too long)
		we store the text as returned by UnHtml */
		if(sqlTextToUTF8(cTmp,pureText,MAXPACKETSIZE)==0)
			strcpy(pureText,cTmp);
	}
	else if(host.type==2)            //Plain text files
	{
		RemoveShit(html);
		OnlyOneSpace(html,pureText,MAXPACKETSIZE);
		textLength=strlen(pureText);
	}
	else if(host.type==4)            //Custom handled files
	{
		//we empty pureText so the module can fill it with the text of the page
		memset(pureText,0,MAXPACKETSIZE);
		textLength=0;
	}
	else
	{
		/* unknown document type: nothing to index */
		goto done;
	}

	//are we using a regular expression filter?
	if( bUseRegularExpressionB == 1 )
	{	//yes: skip the page when the text does not match
		if(regexec(&regexContentFilter, pureText, 0, 0, 0) != 0)
			goto done;
	}	//else continue

	if( (modFilter = GetModFunctionHandlerByName("modFilter")) )
	{	//we are using a custom function as filter
		struct functArg tmpModArg;
		int keepPage;

		tmpModArg.hostInfo = &host;
		tmpModArg.html = html;
		tmpModArg.htmlLength = htmlLength;
		tmpModArg.text = pureText;
		tmpModArg.textLength = textLength;
		tmpModArg.mysqlDB1 = &gMysqlDB1;
		tmpModArg.mysqlDB2 = &gMysqlDB2;
		tmpModArg.mysqlDB3 = &gMysqlDB3;

		thrdBlock(BLOCKDB1);
		thrdBlock(BLOCKINDEX);

		keepPage = modFilter(&tmpModArg);

		thrdUnBlock(BLOCKDB1);
		thrdUnBlock(BLOCKINDEX);

		if(keepPage==0)		//the module rejected the page
			goto done;
		/*else index*/
	}

	/* here we have text that could be dirty so we must clean it*/
	if(host.type==4)
		RemoveShit(pureText);

	sqlQuery = malloc(MAXQUERYSIZE);

	if(sqlQuery==NULL)
		MemoryCorruptedHandler("IndexPage");

	/* NOTE(review): the description goes into the query without SQL escaping;
	it presumably relies on RemoveShit dropping quotes/backslashes - confirm */
	if(usetitle==1)
		RemoveShit(title);
	else
		RemoveShit(host.Description);

	/* mysql_real_escape_string needs up to 2*len+1 bytes of output, so the
	escape buffers are sized from the actual strings */
	hostLen = strlen(host.Host);
	pageLen = strlen(host.Page);
	sanHostname = malloc(hostLen*2+1);
	sanPage = malloc(pageLen*2+1);

	if(sanHostname==NULL || sanPage==NULL)
		MemoryCorruptedHandler("IndexPage");

	mysql_real_escape_string(&gMysqlDB3, sanHostname, host.Host, hostLen);
	mysql_real_escape_string(&gMysqlDB3, sanPage, host.Page, pageLen);

	memset(sqlQuery,0,MAXQUERYSIZE);
	written = snprintf(sqlQuery,MAXQUERYSIZE,"INSERT DELAYED INTO %s SET hostname = \'%s\',page=\'%s\',description=\'%s\',date=curdate(),time=curtime(),version=%i,level=%i,html=CONCAT(\'%s\') ;",gTable,sanHostname,sanPage,(usetitle==1) ? title: host.Description,DBVERSION,host.level,pureText);

	/* the text did not fit: close the string literal AND the CONCAT( call so the
	statement stays well formed (best effort: a cut inside a CONVERT(...) fragment
	still yields a statement the server rejects) */
	if(written<0 || written>=MAXQUERYSIZE)
	{
		sqlQuery[MAXQUERYSIZE-4]='\'';
		sqlQuery[MAXQUERYSIZE-3]=')';
		sqlQuery[MAXQUERYSIZE-2]=';';
		sqlQuery[MAXQUERYSIZE-1]=0;
	}

	my_mysql_ping(&gMysqlDB3,BLOCKINDEX);
	if(my_mysql_query(&gMysqlDB3, sqlQuery, BLOCKINDEX))
	{
		ERROR_LOG(mysql_error(&gMysqlDB3))
		ERROR_LOG(sqlQuery)
		printf("\r\nQuery Error in function IndexPage(): %s\r\n",mysql_error(&gMysqlDB3));
		printf("Trying to reconnect to server...");
		printf("OK\r\nConnecting to Mysql server n.3 (%s)...",MYSQLSERVER3);
		if(sqlConnect(MYSQLSERVER3, USERDB3, PASSDB3, DB3,&gMysqlDB3, MYSQLSERVER_PORT3)==0)
		{
			printf("ERROR\r\n");
			iQuit=1;

			ERROR_LOG(mysql_error(&gMysqlDB3))

			ret = -1;
			goto done;
		}
		printf("OK\r\n");
	}

	if(htmlcache && xCacheHtml==1)	//saves html cache
	{
		/* the escaped html can be much longer than MAXQUERYSIZE, so this
		statement gets its own buffer sized from its parts (128 covers the
		fixed text of the longer format string) */
		cacheQuerySize = strlen(gTable) + strlen(htmlcache) + strlen(sanHostname) + strlen(sanPage) + 128;
		cacheQuery = malloc(cacheQuerySize);

		if(cacheQuery==NULL)
			MemoryCorruptedHandler("IndexPage");

		if(xCacheHtmlCompressed==1)
			snprintf(cacheQuery,cacheQuerySize,"UPDATE %s SET htmlcache=COMPRESS('%s') WHERE hostname='%s' and page='%s';",gTable,htmlcache,sanHostname,sanPage);
		else
			snprintf(cacheQuery,cacheQuerySize,"UPDATE %s SET htmlcache='%s' WHERE hostname='%s' and page='%s';",gTable,htmlcache,sanHostname,sanPage);

		if(my_mysql_query(&gMysqlDB3, cacheQuery, BLOCKINDEX))
		{
			ERROR_LOG(mysql_error(&gMysqlDB3))
			printf("\r\nQuery Error in function IndexPage(): %s\r\n",mysql_error(&gMysqlDB3));
		}
	}

	/* this page went through the indexing step */
	ret = 1;

done:
	/* single exit: every buffer is released exactly once (unused ones are still NULL) */
	FREE(cTmp);
	FREE(pureText);
	FREE(sqlQuery);
	FREE(htmlcache);
	FREE(cacheQuery);
	FREE(sanHostname);
	FREE(sanPage);

	return ret;
}


/* IsPageIndexed
*  Looks host->Host / host->Page up in the pagelist table.
*
*  returns 0 when the lookup reports zero rows (the page is not indexed)
*  returns 1 in every other case (NOTE(review): this presumably includes a
*            failed query, because only an affected-rows count of 0 maps to 0)
*/
int IsPageIndexed(struct sHost* host)
{
	char* sqlQuery;
	MYSQL_RES gRes;
	MYSQL_RES** tmpRes=NULL;
	int ret=1;

	/* one result-pointer slot (the original, larger allocation size is kept) */
	tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));

	sqlQuery = malloc(MAXQUERYSIZE);

	if(tmpRes==NULL || sqlQuery==NULL)
		MemoryCorruptedHandler("IsPageIndexed");

	/* start from a known value so the cleanup below never frees garbage
	if the query helper leaves the slot untouched */
	*tmpRes = NULL;

	/* host and page come from crawled URLs: build the statement with the same
	bounded, escaping helper the other lookups in this file use */
	snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT idpage FROM pagelist WHERE hostname='%s' AND page='%s' LIMIT 1",host->Host, host->Page);
	my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX);

	if(mysql_affected_rows(&gMysqlDB2)==0)	//Page is not indexed -> return 0
		ret = 0;

	if(*tmpRes)
	{
		mysql_free_result(*tmpRes);
	}

	FREE(tmpRes);
	FREE(sqlQuery);

	return ret;
}

/* Copies the NUL-terminated fragment to out[*pos] and advances *pos.
*  The caller has already checked that the fragment fits. */
static void sqlTextToUTF8Append(char* out, int* pos, const char* fragment)
{
	size_t len = strlen(fragment);

	memcpy(out + *pos, fragment, len);
	*pos += (int)len;
}

/* sqlTextToUTF8
*  Copies text to out and replaces HTML character references with SQL fragments.
*  The result is meant to be placed between the quotes of CONCAT('...'), which
*  is why every fragment first closes and finally re-opens a quote:
*    &#NNN; / &#xHH;   ->  ', CONVERT(CONVERT(0x<hex> using UCS2) using UTF8),'
*    &name; in ahList  ->  its replacement as is (type 1), otherwise
*                          ', CONVERT(<replacement> using UTF8),'
*  Anything else, including unknown or malformed references, is copied verbatim.
*
*  text    NUL-terminated input
*  out     output buffer; zero-filled first, so the result is NUL-terminated
*  maxout  size of out in bytes
*
*  returns 1 on success
*          0 when the output does not fit in maxout bytes or an argument is
*            unusable; out is then incomplete and must not be used
*/
int sqlTextToUTF8(char* text, char* out, int maxout)
{
	int x,y;
	int textLen;
	unsigned char curC;
	char* aass;
	char ssaa[10];
	int cont;
	int bAscFound;
	size_t room;

	if(text==NULL || out==NULL || maxout<=0)
		return 0;

	textLen=strlen(text);
	room=(size_t)maxout;

	memset(out,0,room);

	y=0;

	for(x=0;x<textLen;x++)
	{
		curC=text[x];

		if(curC!='&')
		{
			/* references expand, so plain characters need a bounds check too;
			one byte is always left for the terminating NUL */
			if((size_t)y+1>=room)
				return 0;

			out[y++]=curC;
			continue;
		}

		bAscFound=0;

		aass=strchr(text+x,';');
		if(aass && aass-(text+x) < 10)
		{
			/* ssaa = the reference name between '&' and ';' (at most 8 characters) */
			memset(ssaa,0,sizeof(ssaa));
			memcpy(ssaa,text+x+1,(size_t)((aass-(text+x))-1));

			if(ssaa[0]=='#')
			{
				char val[10];
				size_t valLen;

				if(ssaa[1]=='x')
					strcpy(val,ssaa+2);
				else
					snprintf(val,sizeof(val),"%X",(unsigned int)atoi(ssaa+1));

				/* val ends up inside a 0x... SQL literal: accept hex digits only,
				anything else is treated like an unknown reference */
				valLen=strlen(val);
				if(valLen>0 && strspn(val,"0123456789abcdefABCDEF")==valLen)
				{
					if((size_t)y+21+valLen+26>=room)
						return 0;

					sqlTextToUTF8Append(out,&y,"', CONVERT(CONVERT(0x");		/* 21           + */
					sqlTextToUTF8Append(out,&y,val);							/* strlen(val)  + */
					sqlTextToUTF8Append(out,&y," using UCS2) using UTF8),'");	/* 26             */

					x+=strlen(ssaa)+1;	/* the loop's x++ then steps over the ';' */
					continue;
				}
			}
			else
			{
				for(cont=0; ahList[cont].htmlChar; cont++)
				{
					size_t repLen;

					if( strcmp( ahList[cont].htmlChar, ssaa ) != 0 )
						continue;

					repLen=strlen(ahList[cont].rep);

					if(ahList[cont].type==1)	/*ascii*/
					{
						if((size_t)y+repLen>=room)
							return 0;

						sqlTextToUTF8Append(out,&y,ahList[cont].rep);
					}
					else						/*UTF8*/
					{
						if((size_t)y+11+repLen+14>=room)
							return 0;

						sqlTextToUTF8Append(out,&y,"', CONVERT(");			/* 11           + */
						sqlTextToUTF8Append(out,&y,ahList[cont].rep);		/* strlen(rep)  + */
						sqlTextToUTF8Append(out,&y," using UTF8),'");		/* 14             */
					}

					x+=strlen(ssaa)+1;	/* the loop's x++ then steps over the ';' */
					bAscFound=1;
					break;
				}
			}

		}	/*if(aass && aass-(text+x) < 10)*/

		if(bAscFound==0)
		{
			/* not a reference we can convert: keep the '&' literally */
			if((size_t)y+1>=room)
				return 0;

			out[y++]='&';
		}
	}

	return 1;
}

/*****************************************************************************************/
/* TESTING: ows own index */


/* BuildOwsOwnIndex
*  Builds the experimental "own" index for one host: reads idpage and html of
*  every pagelist row whose hostname is host.Host, feeds each row to IndexPage2
*  to fill an in-memory lexicon, then stores the lexicon with StoreOwsIndex and
*  releases it with FreeOwsIndex.
*
*  host     only host.Host is read
*  returns  always 1
*/
int BuildOwsOwnIndex(struct sHost host)
{
	MYSQL_RES gRes;
	MYSQL_RES** tmpRes=NULL;
	MYSQL_ROW row;
	char* sqlQuery;
	OOI_NODE* lexicon;
	unsigned int res_elements, counter = 0;

	/* reset the file-wide lexicon bookkeeping before creating a new lexicon */
	lexicon_number_of_elements = 0;
	lexicon_actual_size = LEXICONWORDSIZE;
	lexicon = InitLexicon();

	tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));
	sqlQuery = malloc(MAXQUERYSIZE);

	/* same allocation check the other query functions in this file perform */
	if(tmpRes==NULL || sqlQuery==NULL)
		MemoryCorruptedHandler("BuildOwsOwnIndex");

	/* known value, so the cleanup below never frees garbage if the query
	helper leaves the slot untouched */
	*tmpRes = NULL;

	snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT idpage, html FROM pagelist WHERE hostname =\'%s\' ",host.Host);

	my_mysql_query_and_store_results(&gMysqlDB2,sqlQuery,tmpRes,&gRes,NO_BLOCK);

	FREE(sqlQuery);

	res_elements = mysql_affected_rows(&gMysqlDB2);

	printf("Building OpenWebSpider Own Index (0 docs of %i)...          ", res_elements);

	fflush(stdout);

	/* NOTE(review): rows are fetched from gRes, which is presumably filled by
	the query helper above - confirm what it contains after a failed query */
	while( (row = mysql_fetch_row(&gRes)) )
	{
		/* a NULL column is reported as a NULL pointer: skip such rows instead of
		handing NULL to atoi / the tokenizer; they still count for the progress */
		if(row[0] && row[1])
			IndexPage2((char*)row[1], atoi(row[0]),&lexicon);

		counter++;
		if(counter % 10 == 0 || counter == res_elements)
			printf("\rBuilding OpenWebSpider Own Index (%i docs of %i)...          ", counter, res_elements);
	}

	if(*tmpRes)
	{
		mysql_free_result(*tmpRes);
	}

	FREE(tmpRes);

	printf("\r\n");

	StoreOwsIndex(lexicon);

	FreeOwsIndex(lexicon);

	printf("\r\n");

	return 1;
}


/* IndexPage2
*  Splits text into tokens (delimiters: INDEXERTOKENS) and records every token
*  whose length is strictly between OWSINDEXMINWORDSIZE and OWSINDEXMAXWORDSIZE:
*  unknown words are added to the lexicon, then the occurrence is appended to
*  the word's inverted index.
*
*  text     NUL-terminated page text; it is modified in place (strtok writes
*           terminators, _strupr is applied to the accepted tokens)
*  page_id  id stored with every occurrence
*  lexicon  address of the lexicon pointer (handed to lstAddWord)
*
*  returns 1 when at least one token was found, 0 otherwise (also for NULL input)
*
*  position counts accepted words only; skipped tokens do not advance it.
*  NOTE: strtok keeps hidden static state, so this function is not reentrant.
*/
int IndexPage2(char* text, unsigned int page_id, OOI_NODE** lexicon)
{
	char* pCh = NULL;
	unsigned int wordLen;
	unsigned int position = 0;

	/* strtok(NULL, ...) would silently continue a previous tokenization */
	if(text==NULL || lexicon==NULL)
		return 0;

	/* step 1: we split all tokens */
	pCh = strtok (text,INDEXERTOKENS);

	if(pCh==NULL || pCh[0]==0)
		return 0;

	for( ; pCh != NULL; pCh = strtok (NULL, INDEXERTOKENS))
	{
		wordLen = strlen(pCh);

		if(wordLen<=OWSINDEXMINWORDSIZE || wordLen>=OWSINDEXMAXWORDSIZE)
			continue;

		/* _strupr presumably upper-cases pCh in place: the calls below reuse pCh */
		if(ndzLookForWord(*lexicon,_strupr(pCh))==-1)	//Add unique word
			lstAddWord(lexicon,pCh);

		UpdateInvertedIndex(*lexicon, pCh,page_id, position);

		position ++ ;
	}

	return 1;
}

/* UpdateInvertedIndex
*  Appends one occurrence (doc_id, position) to the inverted index list of word.
*  Does nothing when the word is not in the lexicon.
*
*  lexicon   lexicon array searched with ndzLookForWord
*  word      word to update
*  doc_id    id of the document containing the word
*  position  position stored with the occurrence
*/
void UpdateInvertedIndex(OOI_NODE* lexicon, char* word, unsigned int doc_id, unsigned int position)
{
	int pos;
	INVERTED_INDEX* ii;
	INVERTED_INDEX* last;
	INVERTED_INDEX* node;

	pos = ndzLookForWord(lexicon, word);

	if(pos==-1)
		return;

	/* the head node keeps a pointer to the current tail in ->last */
	ii = lexicon[pos].ii;

	last = ii->last;

	node = malloc(sizeof(INVERTED_INDEX));

	if(node==NULL)
	{
		/* presumably the handler does not return; bail out anyway so a NULL
		node is never written through */
		MemoryCorruptedHandler("UpdateInvertedIndex");
		return;
	}

	node->doc_id = doc_id;
	node->position = position;
	node->next = NULL;

	/* link the new node behind the old tail and remember it as the new tail */
	last->next = node;
	ii->last = node;
}

/* GetWordId
*  Looks word up in the wordlist table of DB2.
*
*  returns the id of the word (converted with atoi) when a row is found
*          0 when no row is returned
*/
int GetWordId(char* word)
{
	char* sqlQuery;
	MYSQL_RES gRes;
	MYSQL_RES** tmpRes=NULL;
	MYSQL_ROW row;
	unsigned int ret;

	tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));

	sqlQuery = malloc(MAXQUERYSIZE);

	if(tmpRes==NULL || sqlQuery==NULL)
		MemoryCorruptedHandler("GetWordId");

	/* known value, so the cleanup below never frees garbage if the query
	helper leaves the slot untouched */
	*tmpRes = NULL;

	snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id FROM %s.wordlist WHERE word='%s' LIMIT 1", DB2, word);

	my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX);

	FREE(sqlQuery);

	/* NOTE(review): the row is fetched from gRes, which is presumably filled by
	the query helper above - confirm what it contains after a failed query */
	row = mysql_fetch_row(&gRes);

	/* a NULL column is reported as a NULL pointer: treat it as "not found" */
	if(row && row[0])
		ret = atoi(row[0]);
	else
		ret = 0;

	if(*tmpRes)
	{
		mysql_free_result(*tmpRes);
	}

	FREE(tmpRes);

	return ret;
}

/* StoreOwsIndex
*  Writes the in-memory lexicon to DB2. For each of the
*  lexicon_number_of_elements words:
*    1. the word is inserted into <DB2>.wordlist (the result is ignored: the
*       table has a unique index on word, so known words are expected to fail)
*    2. its id is read back with GetWordId
*    3. all occurrences with doc_id>0 are inserted into <DB2>.ii with a single
*       multi-row INSERT
*
*  lexicon  lexicon array built by IndexPage2
*/
void StoreOwsIndex(OOI_NODE* lexicon)
{
	MYCSTR AsqlQuery;
	unsigned int i;
	INVERTED_INDEX* ii;
	char* sqlQuery;
	unsigned int word_id;
	char strTmp[50];
	unsigned int isFirst;

	/*init*/
	AsqlQuery.myString=NULL;

	sqlQuery = malloc(MAXQUERYSIZE);

	/* same allocation check the other query functions in this file perform */
	if(sqlQuery==NULL)
		MemoryCorruptedHandler("StoreOwsIndex");

	my_mysql_ping(&gMysqlDB2,BLOCKINDEX);

	for(i=0;i<lexicon_number_of_elements;i++)
	{
		if( (i+1) % 10 == 0 || i == lexicon_number_of_elements-1)
			printf("\rStoring OpenWebSpider Index to the DB(%i words of %i)...          ",i+1 , lexicon_number_of_elements);

		//Add word (the table has an unique index on the field word)
		/* built with the same escaping helper GetWordId uses for this word,
		so both statements treat the word identically */
		snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"INSERT INTO %s.wordlist (word) VALUES('%s')", DB2, lexicon[i].field);

		my_mysql_query(&gMysqlDB2, sqlQuery, NO_BLOCK);

		/* *** */

		word_id = GetWordId(lexicon[i].field);

		/* is the word in the DB? */
		if(word_id == 0)
			continue;

		myCStrCpy(&AsqlQuery, "INSERT INTO ");
		myCStrCat(&AsqlQuery, DB2);
		myCStrCat(&AsqlQuery, ".ii (wordid, pageid, position) VALUES");
		isFirst = 1;

		for(ii = lexicon[i].ii; ii != NULL; ii = ii->next)
		{
			/* nodes with doc_id 0 carry no occurrence and are skipped */
			if(ii->doc_id>0)
			{
				/* every tuple but the first is preceded by a comma */
				snprintf(strTmp,sizeof(strTmp),"%s(%i,%i,%i)",isFirst ? "" : ",",word_id, ii->doc_id, ii->position );
				isFirst=0;

				myCStrCat(&AsqlQuery, strTmp);
			}
		}

		/* no tuple was appended: "INSERT ... VALUES" alone is not a valid
		statement, so there is nothing to send */
		if(isFirst)
			continue;

		if(my_mysql_query(&gMysqlDB2, AsqlQuery.myString, NO_BLOCK))
		{
			ERROR_LOG(mysql_error(&gMysqlDB2))
			ERROR_LOG(AsqlQuery.myString)
			printf("\r\nQuery Error in function StoreOwsIndex(): %s\r\n",mysql_error(&gMysqlDB2));
		}
	}

	FREE(sqlQuery);
	FREE(AsqlQuery.myString);
}

#endif

/*EOF*/

