
/* OpenWebSpider
 *
 *  Authors:     Stefano Alimonti AND Stefano Fantin
 *  Version:     0.7
 *  E-Mails:     shen139 [at] openwebspider (dot) org AND stefanofantinguz@yahoo.it
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */


/* robots.txt example file

	User-agent: *
	Disallow: /cgi-bin/
	Disallow: /tmp/
	Disallow: /private/

	User-agent: OpenWebSpider
	Crawl-Delay: 5
	Disallow: /private/

*/

#ifndef __ROBOTSTXT
#define __ROBOTSTXT


/*
 * RobotsValueStart
 * Given a pointer to the character right after a directive's ':' returns a
 * pointer to the first character of its value, skipping any spaces. The
 * space after the colon is optional in robots.txt, so it must not be assumed.
 */
static char* RobotsValueStart(char* pAfterColon)
{
	while(*pAfterColon==' ')
		pAfterColon++;

return pAfterColon;
}

/*
 * RobotsValueEnd
 * Returns a pointer to the first character past a directive's value on the
 * line that starts at pLine. That is, in order of precedence:
 *   - an inline '#' comment located before the end of the line;
 *   - the line terminator (the '\r' of "\r\n", or a lone '\n');
 *   - the terminating NUL when the last line has no terminator.
 * Only the current line is inspected, so files mixing "\n" and "\r\n"
 * cannot make a value run over several lines.
 */
static char* RobotsValueEnd(char* pLine)
{
char* pEnd;
char* pHash;

	pEnd=strchr(pLine,'\n');

	if(pEnd==NULL)
		pEnd=pLine+strlen(pLine);
	else if(pEnd>pLine && *(pEnd-1)=='\r')
		pEnd--;

	pHash=strchr(pLine,'#');

	if(pHash!=NULL && pHash<pEnd)
		pEnd=pHash;

return pEnd;
}

/*
 * ParseRobotsTxt
 * Extracts the rules that apply to this crawler from a robots.txt body.
 *
 * The section used is the one introduced by "user-agent: openwebspider" or,
 * when there is none, by "user-agent: *". Scanning stops at the next
 * "user-agent:" line, at the end of the text, or as soon as MAXDISALLOW
 * exclusions have been stored.
 *
 *   html  robots.txt content; it is modified in place (passed to _strlwr,
 *         ReplaceChr and OnlyOneSpace before being scanned).
 *   host  host the file belongs to; handed to ParseUrl together with every
 *         Disallow value (presumably to resolve it - confirm in ParseUrl).
 *
 * Side effects: each accepted "disallow:" value is run through strtrim and
 * ParseUrl and the resulting locHost.Page is copied into the next free slot
 * of lstRobotsExclusions; a "crawl-delay:" value of one or two characters is
 * converted with atoi and stored in iRobCrawlDelay.
 *
 * Returns 0 when no suitable user-agent section is found, 1 otherwise.
 */
int ParseRobotsTxt(char* html,struct sHost host)
{
char* pCur;
char* pValue;
char* pEnd;
long nLen;
int i;
int iHtmlLen;
char sDisallow[MAXPAGESIZE];
char sTrimDisallow[MAXPAGESIZE];
struct sHost locHost;
int pos=0;

	printf(" + Parsing robots.txt\r\n");
	_strlwr(html);

	ReplaceChr(html,'\t',' ');
	OnlyOneSpace(html,html,MAXPACKETSIZE);

	/* NOTE(review): "not found" is detected as a result below html, which */
	/* assumes my_stristr returns NULL on failure - confirm in its source  */
	if((pCur=my_stristr(html,"user-agent: openwebspider"))<html)
		pCur=my_stristr(html,"user-agent: *");

	if(pCur<html)
		return 0;

	/* 13 == strlen("user-agent: *"): moves past the matched "user-agent:" */
	/* so that the end-of-section test in the loop does not fire on it     */
	pCur+=13;
	iHtmlLen=(int)strlen(pCur);

	for(i=0;i<iHtmlLen;i++)
	{
		if(pos==MAXDISALLOW)
			return 1;

		if(strnicmp(pCur+i,"disallow:",9)==0)
		{
			pValue=RobotsValueStart(pCur+i+9);
			pEnd=RobotsValueEnd(pCur+i);
			nLen=(long)(pEnd-pValue);

			/* an empty value ("Disallow:") excludes nothing and is skipped */
			if(nLen>0 && nLen<MAXPAGESIZE-1)
			{
				memset(sDisallow,0,MAXPAGESIZE);
				strncpy(sDisallow,pValue,(size_t)nLen);

				if(ParseUrl(strtrim(sDisallow,sTrimDisallow),&locHost,&host)!=-1)
				{
					strcpy(lstRobotsExclusions[pos++],locHost.Page);
					printf("   - Disallow: %s\r\n",locHost.Page);
				}

				/* jump to the end of the value; the loop's i++ steps over it */
				i+=(int)(pEnd-(pCur+i));
			}
		}
		else if(strnicmp(pCur+i,"crawl-delay:",12)==0)
		{
			pValue=RobotsValueStart(pCur+i+12);
			pEnd=RobotsValueEnd(pCur+i);
			nLen=(long)(pEnd-pValue);

			/*TODO: trim(page)*/
			/* only values of 1 or 2 characters are accepted; this test used */
			/* to depend on pos as well, which dropped any crawl-delay found */
			/* after four or more disallow lines                             */
			if(nLen>0 && nLen<4-1)
			{
			char sTmpCrawlDelay[5];

				memset(sTmpCrawlDelay,0,sizeof(sTmpCrawlDelay));
				strncpy(sTmpCrawlDelay,pValue,(size_t)nLen);
				iRobCrawlDelay=atoi(sTmpCrawlDelay);

				printf("   - Crawl Delay: %i\r\n", iRobCrawlDelay);

				i+=(int)(pEnd-(pCur+i));
			}
		}
		else if(strnicmp(pCur+i,"user-agent:",11)==0)
			return 1;	/* next section starts here: ours is over */
	}

return 1;
}

/*
 * CheckRobotExclusion
 * Tells whether a page may be fetched according to lstRobotsExclusions.
 *
 *   page  path to test.
 *
 * The list is walked until MAXDISALLOW entries have been seen or an empty
 * entry is reached. A page is refused when it is at least as long as an
 * entry and strnicmp reports that it begins with that entry.
 *
 * Returns 0 when the page is excluded, 1 when it is allowed.
 */
int CheckRobotExclusion(char* page)
{
int idx=0;
char* pRule;
size_t nRuleLen;

	while(idx<MAXDISALLOW && lstRobotsExclusions[idx][0]!=0)
	{
		pRule=lstRobotsExclusions[idx];

		if(strlen(page)>=(nRuleLen=strlen(pRule)) && strnicmp(pRule,page,nRuleLen)==0)
			return 0;

		idx++;
	}

return 1;
}

#endif


/*EOF*/


