sparser

Simple parser — extract URLs from text files
git clone https://git.krisyotam.com/krisyotam/sparser.git
Log | Files | Refs | LICENSE

sparser.c (6009B)


      1 /* See LICENSE file for copyright and license details.
      2  *
      3  * sparser - Simple Parser
      4  *
      5  * Extracts external URLs from text files.
      6  * Supports HTML, Markdown, MDX, plain text.
      7  * Can recursively walk directories.
      8  */
      9 
     10 #include <dirent.h>
     11 #include <stdio.h>
     12 #include <stdlib.h>
     13 #include <string.h>
     14 #include <sys/stat.h>
     15 #include <unistd.h>
     16 
     17 #include "config.h"
     18 #include "extract.h"
     19 #include "util.h"
     20 
     21 /* Hash table for URL deduplication */
     22 #define DEDUP_SIZE 65521
     23 
     24 typedef struct DeNode {
     25 	char *url;
     26 	struct DeNode *next;
     27 } DeNode;
     28 
     29 /* Global options */
     30 static int verbose = 0;
     31 static int recurse = 0;
     32 static int dedup = 0;
     33 static DeNode *dedup_table[DEDUP_SIZE];
     34 
     35 static void
     36 usage(void)
     37 {
     38 	fprintf(stderr,
     39 	    "usage: sparser [-vuR] [path | -]\n"
     40 	    "\n"
     41 	    "  -v    verbose (print filenames to stderr)\n"
     42 	    "  -u    deduplicate URLs\n"
     43 	    "  -R    recursive directory scan\n"
     44 	    "\n"
     45 	    "  path  file or directory to scan\n"
     46 	    "  -     read from stdin\n");
     47 	exit(1);
     48 }
     49 
     50 /* FNV-1a hash */
     51 static unsigned long
     52 fnv1a(const char *s)
     53 {
     54 	unsigned long h = 2166136261UL;
     55 
     56 	for (; *s; s++) {
     57 		h ^= (unsigned char)*s;
     58 		h *= 16777619UL;
     59 	}
     60 	return h;
     61 }
     62 
     63 static int
     64 dedup_seen(const char *url)
     65 {
     66 	unsigned long h;
     67 	DeNode *n;
     68 
     69 	h = fnv1a(url) % DEDUP_SIZE;
     70 	for (n = dedup_table[h]; n; n = n->next) {
     71 		if (strcmp(n->url, url) == 0)
     72 			return 1;
     73 	}
     74 	return 0;
     75 }
     76 
     77 static void
     78 dedup_add(const char *url)
     79 {
     80 	unsigned long h;
     81 	DeNode *n;
     82 
     83 	h = fnv1a(url) % DEDUP_SIZE;
     84 	n = xmalloc(sizeof(DeNode));
     85 	n->url = xstrdup(url);
     86 	n->next = dedup_table[h];
     87 	dedup_table[h] = n;
     88 }
     89 
     90 static void
     91 dedup_free(void)
     92 {
     93 	size_t i;
     94 	DeNode *n, *next;
     95 
     96 	for (i = 0; i < DEDUP_SIZE; i++) {
     97 		for (n = dedup_table[i]; n; n = next) {
     98 			next = n->next;
     99 			free(n->url);
    100 			free(n);
    101 		}
    102 	}
    103 }
    104 
    105 /* Callback for each extracted URL */
    106 static void
    107 url_found(const char *url, void *ctx)
    108 {
    109 	(void)ctx;
    110 
    111 	if (dedup) {
    112 		if (dedup_seen(url))
    113 			return;
    114 		dedup_add(url);
    115 	}
    116 
    117 	puts(url);
    118 }
    119 
    120 /* Check if a filename has a text-like extension */
    121 static int
    122 is_text_ext(const char *name)
    123 {
    124 	/* Common text extensions we want to process */
    125 	static const char *exts[] = {
    126 		".html", ".htm", ".xhtml",
    127 		".md", ".mdx", ".markdown",
    128 		".txt", ".text", ".rst",
    129 		".xml", ".rss", ".atom",
    130 		".json", ".yaml", ".yml",
    131 		".css", ".js", ".jsx", ".ts", ".tsx",
    132 		".org", ".adoc", ".tex", ".bib",
    133 		".csv", ".tsv",
    134 		".cfg", ".conf", ".ini",
    135 		".sh", ".bash", ".zsh", ".fish",
    136 		".py", ".rb", ".pl", ".c", ".h",
    137 		".go", ".rs", ".java", ".hs",
    138 		NULL
    139 	};
    140 	int i;
    141 
    142 	for (i = 0; exts[i]; i++) {
    143 		if (str_ends_with(name, exts[i]))
    144 			return 1;
    145 	}
    146 
    147 	/* Files without extension (README, LICENSE, etc.) */
    148 	if (!strchr(name, '.'))
    149 		return 1;
    150 
    151 	return 0;
    152 }
    153 
    154 /* Read entire file into memory. Returns NULL on error. */
    155 static char *
    156 read_file(const char *path, size_t *out_len)
    157 {
    158 	FILE *fp;
    159 	char *data;
    160 	long fsize;
    161 
    162 	if (strcmp(path, "-") == 0) {
    163 		/* Read stdin into buffer */
    164 		size_t cap, len, n;
    165 
    166 		cap = 4096;
    167 		len = 0;
    168 		data = xmalloc(cap);
    169 
    170 		while ((n = fread(data + len, 1, cap - len,
    171 		       stdin)) > 0) {
    172 			len += n;
    173 			if (len >= cap) {
    174 				cap *= 2;
    175 				if (cap > MAX_FILE_SIZE)
    176 					break;
    177 				data = xrealloc(data, cap);
    178 			}
    179 		}
    180 
    181 		data[len] = '\0';
    182 		*out_len = len;
    183 		return data;
    184 	}
    185 
    186 	fp = fopen(path, "rb");
    187 	if (!fp)
    188 		return NULL;
    189 
    190 	if (fseek(fp, 0, SEEK_END) != 0) {
    191 		fclose(fp);
    192 		return NULL;
    193 	}
    194 
    195 	fsize = ftell(fp);
    196 	if (fsize < 0 || fsize > MAX_FILE_SIZE) {
    197 		fclose(fp);
    198 		return NULL;
    199 	}
    200 
    201 	rewind(fp);
    202 
    203 	data = xmalloc(fsize + 1);
    204 	if (fread(data, 1, fsize, fp) != (size_t)fsize) {
    205 		free(data);
    206 		fclose(fp);
    207 		return NULL;
    208 	}
    209 
    210 	data[fsize] = '\0';
    211 	fclose(fp);
    212 
    213 	*out_len = fsize;
    214 	return data;
    215 }
    216 
    217 /* Process a single file */
    218 static void
    219 process_file(const char *path)
    220 {
    221 	char *data;
    222 	size_t len;
    223 
    224 	if (verbose)
    225 		fprintf(stderr, "%s\n", path);
    226 
    227 	data = read_file(path, &len);
    228 	if (!data) {
    229 		if (verbose)
    230 			warn("cannot read: %s", path);
    231 		return;
    232 	}
    233 
    234 	if (len == 0) {
    235 		free(data);
    236 		return;
    237 	}
    238 
    239 	/* Skip binary files */
    240 	if (is_binary(data, len)) {
    241 		if (verbose)
    242 			fprintf(stderr, "  skip binary: %s\n", path);
    243 		free(data);
    244 		return;
    245 	}
    246 
    247 	extract_urls(data, len, url_found, NULL);
    248 	free(data);
    249 }
    250 
    251 /* Recursively walk a directory */
    252 static void
    253 walk_dir(const char *dirpath)
    254 {
    255 	DIR *d;
    256 	struct dirent *ent;
    257 	struct stat st;
    258 	char path[4096];
    259 
    260 	d = opendir(dirpath);
    261 	if (!d) {
    262 		warn("cannot open directory: %s", dirpath);
    263 		return;
    264 	}
    265 
    266 	while ((ent = readdir(d)) != NULL) {
    267 		/* Skip hidden files and . / .. */
    268 		if (ent->d_name[0] == '.')
    269 			continue;
    270 
    271 		/* Skip common non-content directories */
    272 		if (strcmp(ent->d_name, "node_modules") == 0 ||
    273 		    strcmp(ent->d_name, ".git") == 0 ||
    274 		    strcmp(ent->d_name, "__pycache__") == 0 ||
    275 		    strcmp(ent->d_name, "vendor") == 0 ||
    276 		    strcmp(ent->d_name, ".next") == 0 ||
    277 		    strcmp(ent->d_name, "dist") == 0 ||
    278 		    strcmp(ent->d_name, "build") == 0)
    279 			continue;
    280 
    281 		snprintf(path, sizeof(path), "%s/%s",
    282 		    dirpath, ent->d_name);
    283 
    284 		if (stat(path, &st) != 0)
    285 			continue;
    286 
    287 		if (S_ISDIR(st.st_mode)) {
    288 			walk_dir(path);
    289 		} else if (S_ISREG(st.st_mode)) {
    290 			if (is_text_ext(ent->d_name))
    291 				process_file(path);
    292 		}
    293 	}
    294 
    295 	closedir(d);
    296 }
    297 
    298 int
    299 main(int argc, char *argv[])
    300 {
    301 	const char *path;
    302 	struct stat st;
    303 	int opt;
    304 
    305 	while ((opt = getopt(argc, argv, "vuRh")) != -1) {
    306 		switch (opt) {
    307 		case 'v':
    308 			verbose = 1;
    309 			break;
    310 		case 'u':
    311 			dedup = 1;
    312 			break;
    313 		case 'R':
    314 			recurse = 1;
    315 			break;
    316 		case 'h': /* fallthrough */
    317 		default:
    318 			usage();
    319 		}
    320 	}
    321 
    322 	if (optind >= argc)
    323 		usage();
    324 
    325 	path = argv[optind];
    326 
    327 	/* Reading from stdin */
    328 	if (strcmp(path, "-") == 0) {
    329 		process_file("-");
    330 		goto done;
    331 	}
    332 
    333 	if (stat(path, &st) != 0)
    334 		die("cannot stat: %s:", path);
    335 
    336 	if (S_ISDIR(st.st_mode)) {
    337 		if (!recurse)
    338 			die("use -R to scan directories");
    339 		walk_dir(path);
    340 	} else if (S_ISREG(st.st_mode)) {
    341 		process_file(path);
    342 	} else {
    343 		die("not a regular file or directory: %s", path);
    344 	}
    345 
    346 done:
    347 	if (dedup)
    348 		dedup_free();
    349 	return 0;
    350 }