extract.c (4278B)
1 /* See LICENSE file for copyright and license details. 2 * 3 * URL extraction from text content. 4 * 5 * Strategy: scan for "http://" and "https://" anchors, 6 * then greedily extend the match character by character 7 * until hitting a character that cannot be part of a URL. 8 */ 9 10 #include <ctype.h> 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <string.h> 14 15 #include "config.h" 16 #include "extract.h" 17 #include "util.h" 18 19 /* 20 * Characters that are valid in a URL. 21 * RFC 3986: unreserved / pct-encoded / sub-delims / ":" / "@" 22 * / "/" / "?" / "#" / "[" / "]" 23 * 24 * We exclude common trailing punctuation that typically isn't 25 * part of the URL (periods, commas, parens when unbalanced, 26 * angle brackets, quotes). 27 */ 28 static int 29 is_url_char(unsigned char c) 30 { 31 if (isalnum(c)) 32 return 1; 33 34 switch (c) { 35 case '-': case '.': case '_': case '~': /* unreserved */ 36 case ':': case '/': case '?': case '#': /* gen-delims */ 37 case '[': case ']': case '@': 38 case '!': case '$': case '&': case '\'': /* sub-delims */ 39 case '(': case ')': case '*': case '+': 40 case ',': case ';': case '=': 41 case '%': /* pct-encoded */ 42 return 1; 43 default: 44 return 0; 45 } 46 } 47 48 /* 49 * Strip trailing punctuation that is commonly not part of URLs 50 * when they appear in prose text. E.g.: 51 * "Visit https://example.com." -> strip trailing "." 52 * "(see https://example.com)" -> strip trailing ")" 53 * "https://example.com," -> strip trailing "," 54 */ 55 static size_t 56 strip_trailing(const char *url, size_t len) 57 { 58 int parens; 59 size_t i; 60 61 while (len > 0) { 62 unsigned char c = url[len - 1]; 63 64 /* Always strip trailing periods, commas, semicolons, 65 * colons, exclamation marks */ 66 if (c == '.' || c == ',' || c == ';' || 67 c == ':' || c == '!' || c == '\'') { 68 len--; 69 continue; 70 } 71 72 /* Strip trailing ) only if unbalanced */ 73 if (c == ')') { 74 parens = 0; 75 for (i = 0; i < len; i++) { 76 if (url[i] == '(') 77 parens++; 78 else if (url[i] == ')') 79 parens--; 80 } 81 if (parens < 0) { 82 len--; 83 continue; 84 } 85 } 86 87 /* Strip trailing ] only if unbalanced */ 88 if (c == ']') { 89 parens = 0; 90 for (i = 0; i < len; i++) { 91 if (url[i] == '[') 92 parens++; 93 else if (url[i] == ']') 94 parens--; 95 } 96 if (parens < 0) { 97 len--; 98 continue; 99 } 100 } 101 102 /* Strip trailing > (common in angle-bracket URLs) */ 103 if (c == '>') { 104 len--; 105 continue; 106 } 107 108 break; 109 } 110 111 return len; 112 } 113 114 /* 115 * Extract a single URL starting at the given position. 116 * Returns the length of the URL, or 0 if invalid. 117 */ 118 static size_t 119 extract_one(const char *data, size_t pos, size_t total_len) 120 { 121 size_t start, len; 122 123 start = pos; 124 len = 0; 125 126 /* Must start with http:// or https:// */ 127 if (total_len - pos >= 8 && 128 strncmp(data + pos, "https://", 8) == 0) { 129 len = 8; 130 } else if (total_len - pos >= 7 && 131 strncmp(data + pos, "http://", 7) == 0) { 132 len = 7; 133 } else { 134 return 0; 135 } 136 137 /* Greedily extend while characters are valid URL chars */ 138 while (start + len < total_len && 139 is_url_char((unsigned char)data[start + len])) { 140 len++; 141 if (len >= MAX_URL_LEN) 142 break; 143 } 144 145 /* Must have something after the protocol */ 146 if ((data[start + 4] == 's' && len <= 8) || len <= 7) 147 return 0; 148 149 /* Strip trailing punctuation */ 150 len = strip_trailing(data + start, len); 151 152 return len; 153 } 154 155 void 156 extract_urls(const char *data, size_t len, 157 UrlCallback cb, void *ctx) 158 { 159 size_t pos, url_len; 160 char *url; 161 162 pos = 0; 163 while (pos < len) { 164 /* Scan for http:// or https:// */ 165 if (data[pos] != 'h') { 166 pos++; 167 continue; 168 } 169 170 if (pos + 7 > len) { 171 pos++; 172 continue; 173 } 174 175 if (strncmp(data + pos, "http://", 7) != 0 && 176 strncmp(data + pos, "https://", 8) != 0) { 177 pos++; 178 continue; 179 } 180 181 url_len = extract_one(data, pos, len); 182 if (url_len == 0) { 183 pos++; 184 continue; 185 } 186 187 /* Copy URL and deliver via callback */ 188 url = xmalloc(url_len + 1); 189 memcpy(url, data + pos, url_len); 190 url[url_len] = '\0'; 191 192 cb(url, ctx); 193 free(url); 194 195 pos += url_len; 196 } 197 } 198 199 int 200 is_binary(const char *data, size_t len) 201 { 202 size_t i, check_len; 203 204 /* Check first 8KB for null bytes */ 205 check_len = len < 8192 ? len : 8192; 206 for (i = 0; i < check_len; i++) { 207 if (data[i] == '\0') 208 return 1; 209 } 210 211 return 0; 212 }