sbot

Simple web archiver — self-contained GWTAR archives
git clone https://git.krisyotam.com/krisyotam/sbot.git

robots.c (4202B)


/* See LICENSE file for copyright and license details. */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "robots.h"
#include "fetch.h"
#include "util.h"

/*
 * Parse robots.txt content for our user-agent.
 *
 * We look for rules matching "sbot" first,
 * then fall back to "*" (wildcard) rules.
 */
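/*
 * For example, given a robots.txt of:
 *
 *   User-agent: *
 *   Disallow: /tmp/
 *
 *   User-agent: sbot
 *   Disallow: /private/
 *
 * only the "sbot" group is kept, so /private/ is skipped
 * while /tmp/ is still fetched.
 */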
Robots *
robots_fetch(const char *domain)
{
	Robots *r;
	Response *resp;
	char url[1024];
	char *line, *saveptr, *text;
	int in_our_group, in_star_group, found_our_group;
	int star_nrules, star_delay;
	RobotsRule star_rules[MAX_RULES];

	r = xmalloc(sizeof(Robots));
	r->domain = xstrdup(domain);
	r->nrules = 0;
	r->crawl_delay = 0;

	snprintf(url, sizeof(url), "https://%s/robots.txt", domain);
	resp = fetch_url(url);
	if (!resp || resp->status_code >= 400 || !resp->data) {
		/* No robots.txt = everything allowed */
		response_free(resp);
		return r;
	}

	text = xstrdup(resp->data);
	response_free(resp);

	in_our_group = 0;
	in_star_group = 0;
	found_our_group = 0;
	star_nrules = 0;
	star_delay = 0;

	for (line = strtok_r(text, "\n", &saveptr); line;
	     line = strtok_r(NULL, "\n", &saveptr)) {
		char *trimmed, *colon, *key, *val;

		trimmed = str_trim(line);

		/* Skip empty lines and comments */
		if (!*trimmed || *trimmed == '#')
			continue;

		/* Strip inline comments */
		colon = strchr(trimmed, '#');
		if (colon)
			*colon = '\0';

		/* Find key: value */
		colon = strchr(trimmed, ':');
		if (!colon)
			continue;
		*colon = '\0';
		key = str_trim(trimmed);
		val = str_trim(colon + 1);

		if (strcasecmp(key, "user-agent") == 0) {
			/* New user-agent group */
			if (strcasestr(val, "sbot")) {
				in_our_group = 1;
				in_star_group = 0;
				found_our_group = 1;
			} else if (strcmp(val, "*") == 0 &&
			           !found_our_group) {
				in_star_group = 1;
				in_our_group = 0;
			} else {
				in_our_group = 0;
				in_star_group = 0;
			}
			continue;
		}

		if (strcasecmp(key, "disallow") == 0) {
			if (in_our_group && r->nrules < MAX_RULES) {
				r->rules[r->nrules].path = xstrdup(val);
				r->rules[r->nrules].allow = 0;
				r->nrules++;
			} else if (in_star_group &&
			           star_nrules < MAX_RULES) {
				star_rules[star_nrules].path = xstrdup(val);
				star_rules[star_nrules].allow = 0;
				star_nrules++;
			}
		} else if (strcasecmp(key, "allow") == 0) {
			if (in_our_group && r->nrules < MAX_RULES) {
				r->rules[r->nrules].path = xstrdup(val);
				r->rules[r->nrules].allow = 1;
				r->nrules++;
			} else if (in_star_group &&
			           star_nrules < MAX_RULES) {
				star_rules[star_nrules].path = xstrdup(val);
				star_rules[star_nrules].allow = 1;
				star_nrules++;
			}
		} else if (strcasecmp(key, "crawl-delay") == 0) {
			int delay = atoi(val);

			if (delay > 0) {
				if (in_our_group)
					r->crawl_delay = delay;
				else if (in_star_group)
					star_delay = delay;
			}
		}
	}

	/* If no specific rules for us, use wildcard rules */
	if (!found_our_group && star_nrules > 0) {
		int i;

		for (i = 0; i < star_nrules; i++)
			r->rules[i] = star_rules[i];
		r->nrules = star_nrules;
		r->crawl_delay = star_delay;
	} else {
		/* Free star rules if we didn't use them */
		int i;

		for (i = 0; i < star_nrules; i++)
			free(star_rules[i].path);
	}

	free(text);
	return r;
}

int
robots_allowed(Robots *r, const char *path)
{
	int i, best_len, allowed;

	if (!r || r->nrules == 0)
		return 1;

	/*
	 * Match the most specific (longest) rule.
	 * If multiple rules of same length, Allow wins.
	 */
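	/*
	 * For example, with "Disallow: /private" and "Allow: /private/pub",
	 * "/private/pub/index.html" matches the longer Allow rule and is
	 * permitted, while "/private/other" only matches the Disallow rule.
	 */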

	best_len = -1;
	allowed = 1;

	for (i = 0; i < r->nrules; i++) {
		int plen = strlen(r->rules[i].path);

		/* Empty disallow = allow all */
		if (plen == 0 && !r->rules[i].allow)
			continue;

		if (strncmp(path, r->rules[i].path, plen) == 0) {
			if (plen > best_len) {
				best_len = plen;
				allowed = r->rules[i].allow;
			} else if (plen == best_len &&
			           r->rules[i].allow) {
				allowed = 1;
			}
		}
	}

	return allowed;
}

int
robots_delay(Robots *r)
{
	if (!r)
		return 0;
	return r->crawl_delay;
}

void
robots_free(Robots *r)
{
	int i;

	if (!r)
		return;
	for (i = 0; i < r->nrules; i++)
		free(r->rules[i].path);
	free(r->domain);
	free(r);
}
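
A minimal usage sketch of the interface above (hypothetical, not part of the repository): it assumes robots.h declares these four functions and that the program is linked against the rest of sbot (fetch.c, util.c and their dependencies); the domain and path below are placeholders.

/* example.c - hypothetical caller, not part of sbot */
#include <stdio.h>

#include "robots.h"

int
main(void)
{
	Robots *r;

	/* Fetch and parse https://example.org/robots.txt */
	r = robots_fetch("example.org");

	if (robots_allowed(r, "/index.html"))
		printf("allowed; crawl delay %d s\n", robots_delay(r));
	else
		puts("disallowed");

	robots_free(r);
	return 0;
}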