extract.c - sparser - Simple parser — extract URLs from text files

extract.c (4278B)
      1 /* See LICENSE file for copyright and license details.
      2  *
      3  * URL extraction from text content.
      4  *
      5  * Strategy: scan for "http://" and "https://" anchors,
      6  * then greedily extend the match character by character
      7  * until hitting a character that cannot be part of a URL.
      8  */
      9 
     10 #include <ctype.h>
     11 #include <stdio.h>
     12 #include <stdlib.h>
     13 #include <string.h>
     14 
     15 #include "config.h"
     16 #include "extract.h"
     17 #include "util.h"
     18 
     19 /*
     20  * Characters that are valid in a URL.
     21  * RFC 3986: unreserved / pct-encoded / sub-delims / ":" / "@"
     22  *           / "/" / "?" / "#" / "[" / "]"
     23  *
     24  * We exclude common trailing punctuation that typically isn't
     25  * part of the URL (periods, commas, parens when unbalanced,
     26  * angle brackets, quotes).
     27  */
     28 static int
     29 is_url_char(unsigned char c)
     30 {
     31 	if (isalnum(c))
     32 		return 1;
     33 
     34 	switch (c) {
     35 	case '-': case '.': case '_': case '~':  /* unreserved */
     36 	case ':': case '/': case '?': case '#':  /* gen-delims */
     37 	case '[': case ']': case '@':
     38 	case '!': case '$': case '&': case '\'': /* sub-delims */
     39 	case '(': case ')': case '*': case '+':
     40 	case ',': case ';': case '=':
     41 	case '%':                                /* pct-encoded */
     42 		return 1;
     43 	default:
     44 		return 0;
     45 	}
     46 }
     47 
     48 /*
     49  * Strip trailing punctuation that is commonly not part of URLs
     50  * when they appear in prose text. E.g.:
     51  *   "Visit https://example.com."  -> strip trailing "."
     52  *   "(see https://example.com)"   -> strip trailing ")"
     53  *   "https://example.com,"        -> strip trailing ","
     54  */
     55 static size_t
     56 strip_trailing(const char *url, size_t len)
     57 {
     58 	int parens;
     59 	size_t i;
     60 
     61 	while (len > 0) {
     62 		unsigned char c = url[len - 1];
     63 
     64 		/* Always strip trailing periods, commas, semicolons,
     65 		 * colons, exclamation marks */
     66 		if (c == '.' || c == ',' || c == ';' ||
     67 		    c == ':' || c == '!' || c == '\'') {
     68 			len--;
     69 			continue;
     70 		}
     71 
     72 		/* Strip trailing ) only if unbalanced */
     73 		if (c == ')') {
     74 			parens = 0;
     75 			for (i = 0; i < len; i++) {
     76 				if (url[i] == '(')
     77 					parens++;
     78 				else if (url[i] == ')')
     79 					parens--;
     80 			}
     81 			if (parens < 0) {
     82 				len--;
     83 				continue;
     84 			}
     85 		}
     86 
     87 		/* Strip trailing ] only if unbalanced */
     88 		if (c == ']') {
     89 			parens = 0;
     90 			for (i = 0; i < len; i++) {
     91 				if (url[i] == '[')
     92 					parens++;
     93 				else if (url[i] == ']')
     94 					parens--;
     95 			}
     96 			if (parens < 0) {
     97 				len--;
     98 				continue;
     99 			}
    100 		}
    101 
    102 		/* Strip trailing > (common in angle-bracket URLs) */
    103 		if (c == '>') {
    104 			len--;
    105 			continue;
    106 		}
    107 
    108 		break;
    109 	}
    110 
    111 	return len;
    112 }
    113 
    114 /*
    115  * Extract a single URL starting at the given position.
    116  * Returns the length of the URL, or 0 if invalid.
    117  */
    118 static size_t
    119 extract_one(const char *data, size_t pos, size_t total_len)
    120 {
    121 	size_t start, len;
    122 
    123 	start = pos;
    124 	len = 0;
    125 
    126 	/* Must start with http:// or https:// */
    127 	if (total_len - pos >= 8 &&
    128 	    strncmp(data + pos, "https://", 8) == 0) {
    129 		len = 8;
    130 	} else if (total_len - pos >= 7 &&
    131 	           strncmp(data + pos, "http://", 7) == 0) {
    132 		len = 7;
    133 	} else {
    134 		return 0;
    135 	}
    136 
    137 	/* Greedily extend while characters are valid URL chars */
    138 	while (start + len < total_len &&
    139 	       is_url_char((unsigned char)data[start + len])) {
    140 		len++;
    141 		if (len >= MAX_URL_LEN)
    142 			break;
    143 	}
    144 
    145 	/* Must have something after the protocol */
    146 	if ((data[start + 4] == 's' && len <= 8) || len <= 7)
    147 		return 0;
    148 
    149 	/* Strip trailing punctuation */
    150 	len = strip_trailing(data + start, len);
    151 
    152 	return len;
    153 }
    154 
    155 void
    156 extract_urls(const char *data, size_t len,
    157              UrlCallback cb, void *ctx)
    158 {
    159 	size_t pos, url_len;
    160 	char *url;
    161 
    162 	pos = 0;
    163 	while (pos < len) {
    164 		/* Scan for http:// or https:// */
    165 		if (data[pos] != 'h') {
    166 			pos++;
    167 			continue;
    168 		}
    169 
    170 		if (pos + 7 > len) {
    171 			pos++;
    172 			continue;
    173 		}
    174 
    175 		if (strncmp(data + pos, "http://", 7) != 0 &&
    176 		    strncmp(data + pos, "https://", 8) != 0) {
    177 			pos++;
    178 			continue;
    179 		}
    180 
    181 		url_len = extract_one(data, pos, len);
    182 		if (url_len == 0) {
    183 			pos++;
    184 			continue;
    185 		}
    186 
    187 		/* Copy URL and deliver via callback */
    188 		url = xmalloc(url_len + 1);
    189 		memcpy(url, data + pos, url_len);
    190 		url[url_len] = '\0';
    191 
    192 		cb(url, ctx);
    193 		free(url);
    194 
    195 		pos += url_len;
    196 	}
    197 }
    198 
    199 int
    200 is_binary(const char *data, size_t len)
    201 {
    202 	size_t i, check_len;
    203 
    204 	/* Check first 8KB for null bytes */
    205 	check_len = len < 8192 ? len : 8192;
    206 	for (i = 0; i < check_len; i++) {
    207 		if (data[i] == '\0')
    208 			return 1;
    209 	}
    210 
    211 	return 0;
    212 }
	sparser Simple parser — extract URLs from text files
	git clone https://git.krisyotam.com/krisyotam/sparser.git
	Log \| Files \| Refs \| LICENSE