sparser.c (6009B)
1 /* See LICENSE file for copyright and license details. 2 * 3 * sparser - Simple Parser 4 * 5 * Extracts external URLs from text files. 6 * Supports HTML, Markdown, MDX, plain text. 7 * Can recursively walk directories. 8 */ 9 10 #include <dirent.h> 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <string.h> 14 #include <sys/stat.h> 15 #include <unistd.h> 16 17 #include "config.h" 18 #include "extract.h" 19 #include "util.h" 20 21 /* Hash table for URL deduplication */ 22 #define DEDUP_SIZE 65521 23 24 typedef struct DeNode { 25 char *url; 26 struct DeNode *next; 27 } DeNode; 28 29 /* Global options */ 30 static int verbose = 0; 31 static int recurse = 0; 32 static int dedup = 0; 33 static DeNode *dedup_table[DEDUP_SIZE]; 34 35 static void 36 usage(void) 37 { 38 fprintf(stderr, 39 "usage: sparser [-vuR] [path | -]\n" 40 "\n" 41 " -v verbose (print filenames to stderr)\n" 42 " -u deduplicate URLs\n" 43 " -R recursive directory scan\n" 44 "\n" 45 " path file or directory to scan\n" 46 " - read from stdin\n"); 47 exit(1); 48 } 49 50 /* FNV-1a hash */ 51 static unsigned long 52 fnv1a(const char *s) 53 { 54 unsigned long h = 2166136261UL; 55 56 for (; *s; s++) { 57 h ^= (unsigned char)*s; 58 h *= 16777619UL; 59 } 60 return h; 61 } 62 63 static int 64 dedup_seen(const char *url) 65 { 66 unsigned long h; 67 DeNode *n; 68 69 h = fnv1a(url) % DEDUP_SIZE; 70 for (n = dedup_table[h]; n; n = n->next) { 71 if (strcmp(n->url, url) == 0) 72 return 1; 73 } 74 return 0; 75 } 76 77 static void 78 dedup_add(const char *url) 79 { 80 unsigned long h; 81 DeNode *n; 82 83 h = fnv1a(url) % DEDUP_SIZE; 84 n = xmalloc(sizeof(DeNode)); 85 n->url = xstrdup(url); 86 n->next = dedup_table[h]; 87 dedup_table[h] = n; 88 } 89 90 static void 91 dedup_free(void) 92 { 93 size_t i; 94 DeNode *n, *next; 95 96 for (i = 0; i < DEDUP_SIZE; i++) { 97 for (n = dedup_table[i]; n; n = next) { 98 next = n->next; 99 free(n->url); 100 free(n); 101 } 102 } 103 } 104 105 /* Callback for each extracted URL */ 106 static void 107 url_found(const char *url, void *ctx) 108 { 109 (void)ctx; 110 111 if (dedup) { 112 if (dedup_seen(url)) 113 return; 114 dedup_add(url); 115 } 116 117 puts(url); 118 } 119 120 /* Check if a filename has a text-like extension */ 121 static int 122 is_text_ext(const char *name) 123 { 124 /* Common text extensions we want to process */ 125 static const char *exts[] = { 126 ".html", ".htm", ".xhtml", 127 ".md", ".mdx", ".markdown", 128 ".txt", ".text", ".rst", 129 ".xml", ".rss", ".atom", 130 ".json", ".yaml", ".yml", 131 ".css", ".js", ".jsx", ".ts", ".tsx", 132 ".org", ".adoc", ".tex", ".bib", 133 ".csv", ".tsv", 134 ".cfg", ".conf", ".ini", 135 ".sh", ".bash", ".zsh", ".fish", 136 ".py", ".rb", ".pl", ".c", ".h", 137 ".go", ".rs", ".java", ".hs", 138 NULL 139 }; 140 int i; 141 142 for (i = 0; exts[i]; i++) { 143 if (str_ends_with(name, exts[i])) 144 return 1; 145 } 146 147 /* Files without extension (README, LICENSE, etc.) */ 148 if (!strchr(name, '.')) 149 return 1; 150 151 return 0; 152 } 153 154 /* Read entire file into memory. Returns NULL on error. */ 155 static char * 156 read_file(const char *path, size_t *out_len) 157 { 158 FILE *fp; 159 char *data; 160 long fsize; 161 162 if (strcmp(path, "-") == 0) { 163 /* Read stdin into buffer */ 164 size_t cap, len, n; 165 166 cap = 4096; 167 len = 0; 168 data = xmalloc(cap); 169 170 while ((n = fread(data + len, 1, cap - len, 171 stdin)) > 0) { 172 len += n; 173 if (len >= cap) { 174 cap *= 2; 175 if (cap > MAX_FILE_SIZE) 176 break; 177 data = xrealloc(data, cap); 178 } 179 } 180 181 data[len] = '\0'; 182 *out_len = len; 183 return data; 184 } 185 186 fp = fopen(path, "rb"); 187 if (!fp) 188 return NULL; 189 190 if (fseek(fp, 0, SEEK_END) != 0) { 191 fclose(fp); 192 return NULL; 193 } 194 195 fsize = ftell(fp); 196 if (fsize < 0 || fsize > MAX_FILE_SIZE) { 197 fclose(fp); 198 return NULL; 199 } 200 201 rewind(fp); 202 203 data = xmalloc(fsize + 1); 204 if (fread(data, 1, fsize, fp) != (size_t)fsize) { 205 free(data); 206 fclose(fp); 207 return NULL; 208 } 209 210 data[fsize] = '\0'; 211 fclose(fp); 212 213 *out_len = fsize; 214 return data; 215 } 216 217 /* Process a single file */ 218 static void 219 process_file(const char *path) 220 { 221 char *data; 222 size_t len; 223 224 if (verbose) 225 fprintf(stderr, "%s\n", path); 226 227 data = read_file(path, &len); 228 if (!data) { 229 if (verbose) 230 warn("cannot read: %s", path); 231 return; 232 } 233 234 if (len == 0) { 235 free(data); 236 return; 237 } 238 239 /* Skip binary files */ 240 if (is_binary(data, len)) { 241 if (verbose) 242 fprintf(stderr, " skip binary: %s\n", path); 243 free(data); 244 return; 245 } 246 247 extract_urls(data, len, url_found, NULL); 248 free(data); 249 } 250 251 /* Recursively walk a directory */ 252 static void 253 walk_dir(const char *dirpath) 254 { 255 DIR *d; 256 struct dirent *ent; 257 struct stat st; 258 char path[4096]; 259 260 d = opendir(dirpath); 261 if (!d) { 262 warn("cannot open directory: %s", dirpath); 263 return; 264 } 265 266 while ((ent = readdir(d)) != NULL) { 267 /* Skip hidden files and . / .. */ 268 if (ent->d_name[0] == '.') 269 continue; 270 271 /* Skip common non-content directories */ 272 if (strcmp(ent->d_name, "node_modules") == 0 || 273 strcmp(ent->d_name, ".git") == 0 || 274 strcmp(ent->d_name, "__pycache__") == 0 || 275 strcmp(ent->d_name, "vendor") == 0 || 276 strcmp(ent->d_name, ".next") == 0 || 277 strcmp(ent->d_name, "dist") == 0 || 278 strcmp(ent->d_name, "build") == 0) 279 continue; 280 281 snprintf(path, sizeof(path), "%s/%s", 282 dirpath, ent->d_name); 283 284 if (stat(path, &st) != 0) 285 continue; 286 287 if (S_ISDIR(st.st_mode)) { 288 walk_dir(path); 289 } else if (S_ISREG(st.st_mode)) { 290 if (is_text_ext(ent->d_name)) 291 process_file(path); 292 } 293 } 294 295 closedir(d); 296 } 297 298 int 299 main(int argc, char *argv[]) 300 { 301 const char *path; 302 struct stat st; 303 int opt; 304 305 while ((opt = getopt(argc, argv, "vuRh")) != -1) { 306 switch (opt) { 307 case 'v': 308 verbose = 1; 309 break; 310 case 'u': 311 dedup = 1; 312 break; 313 case 'R': 314 recurse = 1; 315 break; 316 case 'h': /* fallthrough */ 317 default: 318 usage(); 319 } 320 } 321 322 if (optind >= argc) 323 usage(); 324 325 path = argv[optind]; 326 327 /* Reading from stdin */ 328 if (strcmp(path, "-") == 0) { 329 process_file("-"); 330 goto done; 331 } 332 333 if (stat(path, &st) != 0) 334 die("cannot stat: %s:", path); 335 336 if (S_ISDIR(st.st_mode)) { 337 if (!recurse) 338 die("use -R to scan directories"); 339 walk_dir(path); 340 } else if (S_ISREG(st.st_mode)) { 341 process_file(path); 342 } else { 343 die("not a regular file or directory: %s", path); 344 } 345 346 done: 347 if (dedup) 348 dedup_free(); 349 return 0; 350 }