sparser

sparser — a suckless C tool that extracts external URLs from text files (HTML, Markdown, MDX, plain text).
Log | Files | Refs | LICENSE

commit f042087d967b02645376087c5eaa0b4259f93cb6
Author: Kris Yotam <krisyotam@protonmail.com>
Date:   Mon, 16 Feb 2026 02:43:07 -0600

Initial commit: sparser - Simple Parser

Suckless C tool that extracts external URLs from text files.
Supports HTML, Markdown, MDX, plain text. Can recursively walk
directories and deduplicate output. Zero external dependencies.

Diffstat:
A.claude/CLAUDE.md | 146+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A.gitignore | 2++
ALICENSE | 21+++++++++++++++++++++
AMakefile | 41+++++++++++++++++++++++++++++++++++++++++
Aconfig.h | 21+++++++++++++++++++++
Aextract.c | 212+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aextract.h | 22++++++++++++++++++++++
Asparser.c | 350+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Autil.c | 101+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Autil.h | 22++++++++++++++++++++++
10 files changed, 938 insertions(+), 0 deletions(-)

diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md @@ -0,0 +1,146 @@ +# sparser — CLAUDE.md + +## Project + +sparser (Simple Parser) is a suckless tool that extracts external URLs +from text-based files. It handles HTML, Markdown (MD/MDX), plain text, +and other text files. It can process a single file, read from stdin, or +recursively walk a directory tree. Outputs one URL per line to stdout. + +Designed to pair with suploader for a pipeline: + sparser -R /content | suploader - + +## Coding Standards — Suckless C Style + +All code in this project MUST follow the suckless.org coding style: + +### Language +- C99 (ISO/IEC 9899:1999), no extensions +- POSIX.1-2008 (`_POSIX_C_SOURCE 200809L`) + +### Indentation & Whitespace +- Tabs for indentation (1 tab = 1 level) +- Spaces for alignment only, never for indentation +- No tabs except at the beginning of a line +- Maximum line length: 79 characters + +### Comments +- Use `/* */` only, never `//` +- Comment fallthrough cases in switch statements + +### Variables +- All declarations at the top of the block +- Pointer `*` adjacent to variable name: `char *p`, not `char* p` +- No C99 `bool`; use `int` (0/1) +- Global/static variables not used outside TU must be `static` + +### Functions +- Return type on its own line +- Function name at column 0 on next line (enables `grep ^funcname`) +- Opening `{` on its own line for functions +- Functions not used outside their file: `static` + +```c +static void +usage(void) +{ + fprintf(stderr, "usage: sparser [-v] [-R] path\n"); + exit(1); +} +``` + +### Braces +- Opening `{` on same line for control flow (if, for, while, switch) +- Closing `}` on its own line unless continuing (else, do-while) +- Use braces even for single statements when sibling branches use them + +### Naming +- lowercase_with_underscores for functions and variables +- UPPERCASE for macros and constants +- CamelCase for typedef'd struct types +- No `_t` suffix (reserved by POSIX) +- Prefix module functions with 
module name + +### Control Flow +- Space after `if`, `for`, `while`, `switch` +- No space after `(` or before `)` +- Use `goto` for cleanup/unwind, not nested ifs +- Return/exit early on failure +- Test against 0, not -1: `if (func() < 0)` + +### Error Handling +- All allocation checked; goto cleanup on failure +- `die()` for fatal errors (prints message, exits) +- `warn()` for recoverable errors (prints, continues) + +### File Organization Order +1. License header +2. System includes (alphabetical) +3. Local includes +4. Macros +5. Type definitions +6. Function declarations +7. Global variables +8. Function definitions (same order as declarations) + +### Headers +- System headers first, alphabetical +- Local headers after blank line +- No cyclic dependencies +- Include only what is needed + +## Architecture + +### Module Layout + +| Module | Prefix | File | Responsibility | +|--------|--------|------|----------------| +| Main | — | sparser.c | Entry point, directory walking, file dispatch | +| Extract | `extract_` | extract.c | URL extraction from text content | +| Utilities | `die`, `warn`, `x*` | util.c | Memory wrappers, string ops, error handling | +| Config | — | config.h | Compile-time constants | + +### Architecture Rules +- **Separate compilation.** Every .c file compiles independently. +- **No dynamic loading.** All features compiled in. +- **No external dependencies.** Pure C99 + POSIX. +- **Line-oriented output.** One URL per line to stdout. +- **Unix pipeline friendly.** Works with pipes, xargs, etc. 
+ +## Build + +```sh +make # build sparser binary +make clean # remove build artifacts +make install # install to /usr/local/bin +``` + +Dependencies: none (pure C99 + POSIX) + +## Usage + +```sh +# Extract URLs from a single file +sparser page.html + +# Recursive directory scan +sparser -R /content + +# Read from stdin +cat file.md | sparser - + +# Verbose (show file names being processed) +sparser -v -R /content + +# Deduplicate output +sparser -u -R /content + +# Pipeline with suploader +sparser -u -R /content | suploader - +``` + +## Git Conventions + +- No `Co-Authored-By: Claude` lines +- Commit messages: imperative, <72 chars, no period +- One logical change per commit diff --git a/.gitignore b/.gitignore @@ -0,0 +1,2 @@ +sparser +*.o diff --git a/LICENSE b/LICENSE @@ -0,0 +1,21 @@ +MIT/X Consortium License + +(c) 2026 Kris Yotam <krisyotam@proton.me> + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/Makefile b/Makefile @@ -0,0 +1,41 @@ +# sparser - Simple Parser +# See LICENSE file for copyright and license details. + +VERSION = 0.1.0 + +# paths +PREFIX = /usr/local +MANPREFIX = $(PREFIX)/share/man + +# flags +CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_POSIX_C_SOURCE=200809L -DVERSION=\"$(VERSION)\" +CFLAGS = -std=c99 -pedantic -Wall -Wextra -Os $(CPPFLAGS) +LDFLAGS = + +# compiler +CC = cc + +# sources +SRC = sparser.c extract.c util.c +OBJ = $(SRC:.c=.o) + +all: sparser + +.c.o: + $(CC) $(CFLAGS) -c $< + +sparser: $(OBJ) + $(CC) -o $@ $(OBJ) $(LDFLAGS) + +clean: + rm -f sparser $(OBJ) + +install: all + mkdir -p $(DESTDIR)$(PREFIX)/bin + cp -f sparser $(DESTDIR)$(PREFIX)/bin + chmod 755 $(DESTDIR)$(PREFIX)/bin/sparser + +uninstall: + rm -f $(DESTDIR)$(PREFIX)/bin/sparser + +.PHONY: all clean install uninstall diff --git a/config.h b/config.h @@ -0,0 +1,21 @@ +/* See LICENSE file for copyright and license details. + * sparser - Simple Parser + * configuration header + */ + +#ifndef CONFIG_H +#define CONFIG_H + +/* Program metadata */ +#define PROG_NAME "sparser" +#define PROG_VERSION "0.1.0" + +/* File processing limits */ +#define MAX_FILE_SIZE (100 * 1024 * 1024) /* 100 MB max file */ +#define MAX_LINE_LEN 8192 +#define MAX_URL_LEN 4096 + +/* Supported text file extensions (checked during -R recursion) */ +/* Binary files and executables are always skipped */ + +#endif /* CONFIG_H */ diff --git a/extract.c b/extract.c @@ -0,0 +1,212 @@ +/* See LICENSE file for copyright and license details. + * + * URL extraction from text content. + * + * Strategy: scan for "http://" and "https://" anchors, + * then greedily extend the match character by character + * until hitting a character that cannot be part of a URL. + */ + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "config.h" +#include "extract.h" +#include "util.h" + +/* + * Characters that are valid in a URL. 
+ * RFC 3986: unreserved / pct-encoded / sub-delims / ":" / "@" + * / "/" / "?" / "#" / "[" / "]" + * + * We exclude common trailing punctuation that typically isn't + * part of the URL (periods, commas, parens when unbalanced, + * angle brackets, quotes). + */ +static int +is_url_char(unsigned char c) +{ + if (isalnum(c)) + return 1; + + switch (c) { + case '-': case '.': case '_': case '~': /* unreserved */ + case ':': case '/': case '?': case '#': /* gen-delims */ + case '[': case ']': case '@': + case '!': case '$': case '&': case '\'': /* sub-delims */ + case '(': case ')': case '*': case '+': + case ',': case ';': case '=': + case '%': /* pct-encoded */ + return 1; + default: + return 0; + } +} + +/* + * Strip trailing punctuation that is commonly not part of URLs + * when they appear in prose text. E.g.: + * "Visit https://example.com." -> strip trailing "." + * "(see https://example.com)" -> strip trailing ")" + * "https://example.com," -> strip trailing "," + */ +static size_t +strip_trailing(const char *url, size_t len) +{ + int parens; + size_t i; + + while (len > 0) { + unsigned char c = url[len - 1]; + + /* Always strip trailing periods, commas, semicolons, + * colons, exclamation marks */ + if (c == '.' || c == ',' || c == ';' || + c == ':' || c == '!' || c == '\'') { + len--; + continue; + } + + /* Strip trailing ) only if unbalanced */ + if (c == ')') { + parens = 0; + for (i = 0; i < len; i++) { + if (url[i] == '(') + parens++; + else if (url[i] == ')') + parens--; + } + if (parens < 0) { + len--; + continue; + } + } + + /* Strip trailing ] only if unbalanced */ + if (c == ']') { + parens = 0; + for (i = 0; i < len; i++) { + if (url[i] == '[') + parens++; + else if (url[i] == ']') + parens--; + } + if (parens < 0) { + len--; + continue; + } + } + + /* Strip trailing > (common in angle-bracket URLs) */ + if (c == '>') { + len--; + continue; + } + + break; + } + + return len; +} + +/* + * Extract a single URL starting at the given position. 
+ * Returns the length of the URL, or 0 if invalid. + */ +static size_t +extract_one(const char *data, size_t pos, size_t total_len) +{ + size_t start, len; + + start = pos; + len = 0; + + /* Must start with http:// or https:// */ + if (total_len - pos >= 8 && + strncmp(data + pos, "https://", 8) == 0) { + len = 8; + } else if (total_len - pos >= 7 && + strncmp(data + pos, "http://", 7) == 0) { + len = 7; + } else { + return 0; + } + + /* Greedily extend while characters are valid URL chars */ + while (start + len < total_len && + is_url_char((unsigned char)data[start + len])) { + len++; + if (len >= MAX_URL_LEN) + break; + } + + /* Must have something after the protocol */ + if ((data[start + 4] == 's' && len <= 8) || len <= 7) + return 0; + + /* Strip trailing punctuation */ + len = strip_trailing(data + start, len); + + return len; +} + +void +extract_urls(const char *data, size_t len, + UrlCallback cb, void *ctx) +{ + size_t pos, url_len; + char *url; + + pos = 0; + while (pos < len) { + /* Scan for http:// or https:// */ + if (data[pos] != 'h') { + pos++; + continue; + } + + if (pos + 7 > len) { + pos++; + continue; + } + + if (strncmp(data + pos, "http://", 7) != 0 && + strncmp(data + pos, "https://", 8) != 0) { + pos++; + continue; + } + + url_len = extract_one(data, pos, len); + if (url_len == 0) { + pos++; + continue; + } + + /* Copy URL and deliver via callback */ + url = xmalloc(url_len + 1); + memcpy(url, data + pos, url_len); + url[url_len] = '\0'; + + cb(url, ctx); + free(url); + + pos += url_len; + } +} + +int +is_binary(const char *data, size_t len) +{ + size_t i, check_len; + + /* Check first 8KB for null bytes */ + check_len = len < 8192 ? len : 8192; + for (i = 0; i < check_len; i++) { + if (data[i] == '\0') + return 1; + } + + return 0; +} diff --git a/extract.h b/extract.h @@ -0,0 +1,22 @@ +/* See LICENSE file for copyright and license details. 
*/ + +#ifndef EXTRACT_H +#define EXTRACT_H + +#include <stddef.h> + +/* Callback invoked for each extracted URL. + * url: the extracted URL string + * ctx: user context pointer */ +typedef void (*UrlCallback)(const char *url, void *ctx); + +/* Extract all http/https URLs from a buffer. + * Calls cb for each URL found. + * Handles: plain text, HTML, Markdown, MDX */ +void extract_urls(const char *data, size_t len, + UrlCallback cb, void *ctx); + +/* Check if a file appears to be binary (contains null bytes) */ +int is_binary(const char *data, size_t len); + +#endif /* EXTRACT_H */ diff --git a/sparser.c b/sparser.c @@ -0,0 +1,350 @@ +/* See LICENSE file for copyright and license details. + * + * sparser - Simple Parser + * + * Extracts external URLs from text files. + * Supports HTML, Markdown, MDX, plain text. + * Can recursively walk directories. + */ + +#include <dirent.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "config.h" +#include "extract.h" +#include "util.h" + +/* Hash table for URL deduplication */ +#define DEDUP_SIZE 65521 + +typedef struct DeNode { + char *url; + struct DeNode *next; +} DeNode; + +/* Global options */ +static int verbose = 0; +static int recurse = 0; +static int dedup = 0; +static DeNode *dedup_table[DEDUP_SIZE]; + +static void +usage(void) +{ + fprintf(stderr, + "usage: sparser [-vuR] [path | -]\n" + "\n" + " -v verbose (print filenames to stderr)\n" + " -u deduplicate URLs\n" + " -R recursive directory scan\n" + "\n" + " path file or directory to scan\n" + " - read from stdin\n"); + exit(1); +} + +/* FNV-1a hash */ +static unsigned long +fnv1a(const char *s) +{ + unsigned long h = 2166136261UL; + + for (; *s; s++) { + h ^= (unsigned char)*s; + h *= 16777619UL; + } + return h; +} + +static int +dedup_seen(const char *url) +{ + unsigned long h; + DeNode *n; + + h = fnv1a(url) % DEDUP_SIZE; + for (n = dedup_table[h]; n; n = n->next) { + if (strcmp(n->url, url) == 
0) + return 1; + } + return 0; +} + +static void +dedup_add(const char *url) +{ + unsigned long h; + DeNode *n; + + h = fnv1a(url) % DEDUP_SIZE; + n = xmalloc(sizeof(DeNode)); + n->url = xstrdup(url); + n->next = dedup_table[h]; + dedup_table[h] = n; +} + +static void +dedup_free(void) +{ + size_t i; + DeNode *n, *next; + + for (i = 0; i < DEDUP_SIZE; i++) { + for (n = dedup_table[i]; n; n = next) { + next = n->next; + free(n->url); + free(n); + } + } +} + +/* Callback for each extracted URL */ +static void +url_found(const char *url, void *ctx) +{ + (void)ctx; + + if (dedup) { + if (dedup_seen(url)) + return; + dedup_add(url); + } + + puts(url); +} + +/* Check if a filename has a text-like extension */ +static int +is_text_ext(const char *name) +{ + /* Common text extensions we want to process */ + static const char *exts[] = { + ".html", ".htm", ".xhtml", + ".md", ".mdx", ".markdown", + ".txt", ".text", ".rst", + ".xml", ".rss", ".atom", + ".json", ".yaml", ".yml", + ".css", ".js", ".jsx", ".ts", ".tsx", + ".org", ".adoc", ".tex", ".bib", + ".csv", ".tsv", + ".cfg", ".conf", ".ini", + ".sh", ".bash", ".zsh", ".fish", + ".py", ".rb", ".pl", ".c", ".h", + ".go", ".rs", ".java", ".hs", + NULL + }; + int i; + + for (i = 0; exts[i]; i++) { + if (str_ends_with(name, exts[i])) + return 1; + } + + /* Files without extension (README, LICENSE, etc.) */ + if (!strchr(name, '.')) + return 1; + + return 0; +} + +/* Read entire file into memory. Returns NULL on error. 
*/ +static char * +read_file(const char *path, size_t *out_len) +{ + FILE *fp; + char *data; + long fsize; + + if (strcmp(path, "-") == 0) { + /* Read stdin into buffer */ + size_t cap, len, n; + + cap = 4096; + len = 0; + data = xmalloc(cap); + + while ((n = fread(data + len, 1, cap - len, + stdin)) > 0) { + len += n; + if (len >= cap) { + cap *= 2; + if (cap > MAX_FILE_SIZE) + break; + data = xrealloc(data, cap); + } + } + + data[len] = '\0'; + *out_len = len; + return data; + } + + fp = fopen(path, "rb"); + if (!fp) + return NULL; + + if (fseek(fp, 0, SEEK_END) != 0) { + fclose(fp); + return NULL; + } + + fsize = ftell(fp); + if (fsize < 0 || fsize > MAX_FILE_SIZE) { + fclose(fp); + return NULL; + } + + rewind(fp); + + data = xmalloc(fsize + 1); + if (fread(data, 1, fsize, fp) != (size_t)fsize) { + free(data); + fclose(fp); + return NULL; + } + + data[fsize] = '\0'; + fclose(fp); + + *out_len = fsize; + return data; +} + +/* Process a single file */ +static void +process_file(const char *path) +{ + char *data; + size_t len; + + if (verbose) + fprintf(stderr, "%s\n", path); + + data = read_file(path, &len); + if (!data) { + if (verbose) + warn("cannot read: %s", path); + return; + } + + if (len == 0) { + free(data); + return; + } + + /* Skip binary files */ + if (is_binary(data, len)) { + if (verbose) + fprintf(stderr, " skip binary: %s\n", path); + free(data); + return; + } + + extract_urls(data, len, url_found, NULL); + free(data); +} + +/* Recursively walk a directory */ +static void +walk_dir(const char *dirpath) +{ + DIR *d; + struct dirent *ent; + struct stat st; + char path[4096]; + + d = opendir(dirpath); + if (!d) { + warn("cannot open directory: %s", dirpath); + return; + } + + while ((ent = readdir(d)) != NULL) { + /* Skip hidden files and . / .. 
*/ + if (ent->d_name[0] == '.') + continue; + + /* Skip common non-content directories */ + if (strcmp(ent->d_name, "node_modules") == 0 || + strcmp(ent->d_name, ".git") == 0 || + strcmp(ent->d_name, "__pycache__") == 0 || + strcmp(ent->d_name, "vendor") == 0 || + strcmp(ent->d_name, ".next") == 0 || + strcmp(ent->d_name, "dist") == 0 || + strcmp(ent->d_name, "build") == 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", + dirpath, ent->d_name); + + if (stat(path, &st) != 0) + continue; + + if (S_ISDIR(st.st_mode)) { + walk_dir(path); + } else if (S_ISREG(st.st_mode)) { + if (is_text_ext(ent->d_name)) + process_file(path); + } + } + + closedir(d); +} + +int +main(int argc, char *argv[]) +{ + const char *path; + struct stat st; + int opt; + + while ((opt = getopt(argc, argv, "vuRh")) != -1) { + switch (opt) { + case 'v': + verbose = 1; + break; + case 'u': + dedup = 1; + break; + case 'R': + recurse = 1; + break; + case 'h': /* fallthrough */ + default: + usage(); + } + } + + if (optind >= argc) + usage(); + + path = argv[optind]; + + /* Reading from stdin */ + if (strcmp(path, "-") == 0) { + process_file("-"); + goto done; + } + + if (stat(path, &st) != 0) + die("cannot stat: %s:", path); + + if (S_ISDIR(st.st_mode)) { + if (!recurse) + die("use -R to scan directories"); + walk_dir(path); + } else if (S_ISREG(st.st_mode)) { + process_file(path); + } else { + die("not a regular file or directory: %s", path); + } + +done: + if (dedup) + dedup_free(); + return 0; +} diff --git a/util.c b/util.c @@ -0,0 +1,101 @@ +/* See LICENSE file for copyright and license details. */ + +#include <ctype.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "util.h" + +void +die(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + if (fmt[0] && fmt[strlen(fmt) - 1] == ':') { + fputc(' ', stderr); + perror(NULL); + } else { + fputc('\n', stderr); + } + exit(1); +} + +void +warn(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "warning: "); + vfprintf(stderr, fmt, ap); + va_end(ap); + fputc('\n', stderr); +} + +void * +xmalloc(size_t size) +{ + void *p = malloc(size); + + if (!p) + die("malloc:"); + return p; +} + +void * +xrealloc(void *ptr, size_t size) +{ + void *p = realloc(ptr, size); + + if (!p) + die("realloc:"); + return p; +} + +char * +xstrdup(const char *s) +{ + char *p = strdup(s); + + if (!p) + die("strdup:"); + return p; +} + +char * +str_trim(char *str) +{ + char *end; + + while (isspace((unsigned char)*str)) + str++; + if (*str == '\0') + return str; + end = str + strlen(str) - 1; + while (end > str && isspace((unsigned char)*end)) + end--; + end[1] = '\0'; + return str; +} + +int +str_starts_with(const char *str, const char *prefix) +{ + return strncmp(str, prefix, strlen(prefix)) == 0; +} + +int +str_ends_with(const char *str, const char *suffix) +{ + size_t slen = strlen(str); + size_t suflen = strlen(suffix); + + if (suflen > slen) + return 0; + return strcmp(str + slen - suflen, suffix) == 0; +} diff --git a/util.h b/util.h @@ -0,0 +1,22 @@ +/* See LICENSE file for copyright and license details. */ + +#ifndef UTIL_H +#define UTIL_H + +#include <stddef.h> + +/* Memory allocation with error handling */ +void *xmalloc(size_t size); +void *xrealloc(void *ptr, size_t size); +char *xstrdup(const char *s); + +/* String utilities */ +char *str_trim(char *str); +int str_starts_with(const char *str, const char *prefix); +int str_ends_with(const char *str, const char *suffix); + +/* Error handling */ +void die(const char *fmt, ...); +void warn(const char *fmt, ...); + +#endif /* UTIL_H */