robots.c (4202B)
1 /* See LICENSE file for copyright and license details. */ 2 3 #include <ctype.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 8 #include "robots.h" 9 #include "fetch.h" 10 #include "util.h" 11 12 /* 13 * Parse robots.txt content for our user-agent. 14 * 15 * We look for rules matching "sbot" first, 16 * then fall back to "*" (wildcard) rules. 17 */ 18 Robots * 19 robots_fetch(const char *domain) 20 { 21 Robots *r; 22 Response *resp; 23 char url[1024]; 24 char *line, *saveptr, *text; 25 int in_our_group, in_star_group, found_our_group; 26 int star_nrules, star_delay; 27 RobotsRule star_rules[MAX_RULES]; 28 29 r = xmalloc(sizeof(Robots)); 30 r->domain = xstrdup(domain); 31 r->nrules = 0; 32 r->crawl_delay = 0; 33 34 snprintf(url, sizeof(url), "https://%s/robots.txt", domain); 35 resp = fetch_url(url); 36 if (!resp || resp->status_code >= 400 || !resp->data) { 37 /* No robots.txt = everything allowed */ 38 response_free(resp); 39 return r; 40 } 41 42 text = xstrdup(resp->data); 43 response_free(resp); 44 45 in_our_group = 0; 46 in_star_group = 0; 47 found_our_group = 0; 48 star_nrules = 0; 49 star_delay = 0; 50 51 for (line = strtok_r(text, "\n", &saveptr); line; 52 line = strtok_r(NULL, "\n", &saveptr)) { 53 char *trimmed, *colon, *key, *val; 54 55 trimmed = str_trim(line); 56 57 /* Skip empty lines and comments */ 58 if (!*trimmed || *trimmed == '#') 59 continue; 60 61 /* Strip inline comments */ 62 colon = strchr(trimmed, '#'); 63 if (colon) 64 *colon = '\0'; 65 66 /* Find key: value */ 67 colon = strchr(trimmed, ':'); 68 if (!colon) 69 continue; 70 *colon = '\0'; 71 key = str_trim(trimmed); 72 val = str_trim(colon + 1); 73 74 if (strcasecmp(key, "user-agent") == 0) { 75 /* New user-agent group */ 76 if (strcasestr(val, "sbot")) { 77 in_our_group = 1; 78 in_star_group = 0; 79 found_our_group = 1; 80 } else if (strcmp(val, "*") == 0 && 81 !found_our_group) { 82 in_star_group = 1; 83 in_our_group = 0; 84 } else { 85 in_our_group = 0; 86 in_star_group = 0; 87 } 88 continue; 89 } 90 91 if (strcasecmp(key, "disallow") == 0) { 92 if (in_our_group && r->nrules < MAX_RULES) { 93 r->rules[r->nrules].path = xstrdup(val); 94 r->rules[r->nrules].allow = 0; 95 r->nrules++; 96 } else if (in_star_group && 97 star_nrules < MAX_RULES) { 98 star_rules[star_nrules].path = xstrdup(val); 99 star_rules[star_nrules].allow = 0; 100 star_nrules++; 101 } 102 } else if (strcasecmp(key, "allow") == 0) { 103 if (in_our_group && r->nrules < MAX_RULES) { 104 r->rules[r->nrules].path = xstrdup(val); 105 r->rules[r->nrules].allow = 1; 106 r->nrules++; 107 } else if (in_star_group && 108 star_nrules < MAX_RULES) { 109 star_rules[star_nrules].path = xstrdup(val); 110 star_rules[star_nrules].allow = 1; 111 star_nrules++; 112 } 113 } else if (strcasecmp(key, "crawl-delay") == 0) { 114 int delay = atoi(val); 115 116 if (delay > 0) { 117 if (in_our_group) 118 r->crawl_delay = delay; 119 else if (in_star_group) 120 star_delay = delay; 121 } 122 } 123 } 124 125 /* If no specific rules for us, use wildcard rules */ 126 if (!found_our_group && star_nrules > 0) { 127 int i; 128 129 for (i = 0; i < star_nrules; i++) 130 r->rules[i] = star_rules[i]; 131 r->nrules = star_nrules; 132 r->crawl_delay = star_delay; 133 } else if (!found_our_group) { 134 /* Free star rules if we didn't use them */ 135 int i; 136 137 for (i = 0; i < star_nrules; i++) 138 free(star_rules[i].path); 139 } 140 141 free(text); 142 return r; 143 } 144 145 int 146 robots_allowed(Robots *r, const char *path) 147 { 148 int i, best_len, allowed; 149 150 if (!r || r->nrules == 0) 151 return 1; 152 153 /* 154 * Match the most specific (longest) rule. 155 * If multiple rules of same length, Allow wins. 156 */ 157 best_len = -1; 158 allowed = 1; 159 160 for (i = 0; i < r->nrules; i++) { 161 int plen = strlen(r->rules[i].path); 162 163 /* Empty disallow = allow all */ 164 if (plen == 0 && !r->rules[i].allow) 165 continue; 166 167 if (strncmp(path, r->rules[i].path, plen) == 0) { 168 if (plen > best_len) { 169 best_len = plen; 170 allowed = r->rules[i].allow; 171 } else if (plen == best_len && 172 r->rules[i].allow) { 173 allowed = 1; 174 } 175 } 176 } 177 178 return allowed; 179 } 180 181 int 182 robots_delay(Robots *r) 183 { 184 if (!r) 185 return 0; 186 return r->crawl_delay; 187 } 188 189 void 190 robots_free(Robots *r) 191 { 192 int i; 193 194 if (!r) 195 return; 196 for (i = 0; i < r->nrules; i++) 197 free(r->rules[i].path); 198 free(r->domain); 199 free(r); 200 }