suploader.c (6358B)
1 /* See LICENSE file for copyright and license details. 2 * 3 * suploader - Simple Uploader 4 * 5 * Uploads URLs to web archive services (Internet Archive, 6 * archive.today, Wikiwix). Reads URLs from a file or stdin. 7 */ 8 9 #include <stdio.h> 10 #include <stdlib.h> 11 #include <string.h> 12 #include <time.h> 13 #include <unistd.h> 14 15 #include "config.h" 16 #include "services.h" 17 #include "util.h" 18 19 /* Global options */ 20 static int verbose = 0; 21 static int daemon_mode = 0; 22 static int delay_secs = DEFAULT_DELAY; 23 static int random_order = 0; 24 static int services = SVC_ALL; 25 26 static void 27 usage(void) 28 { 29 fprintf(stderr, 30 "usage: suploader [-vrD] [-d secs] [-s services]" 31 " [file | -]\n" 32 "\n" 33 " -v verbose output\n" 34 " -r random URL selection order\n" 35 " -D daemon mode (loop forever)\n" 36 " -d secs delay between submissions" 37 " (default: %d)\n" 38 " -s services comma-separated: ia,archiveph," 39 "wikiwix (default: all)\n" 40 "\n" 41 " file file with URLs, one per line\n" 42 " - read from stdin\n", 43 DEFAULT_DELAY); 44 exit(1); 45 } 46 47 /* Read all URLs from a file into an array. 48 * Returns array of strings, sets *count. 49 * Caller frees the array and each string. */ 50 static char ** 51 read_urls(const char *path, int *count) 52 { 53 FILE *fp; 54 char line[MAX_URL_LEN]; 55 char **urls; 56 int n, cap; 57 char *trimmed; 58 59 if (strcmp(path, "-") == 0) 60 fp = stdin; 61 else 62 fp = fopen(path, "r"); 63 64 if (!fp) 65 die("cannot open: %s:", path); 66 67 cap = 256; 68 n = 0; 69 urls = xmalloc(cap * sizeof(char *)); 70 71 while (fgets(line, sizeof(line), fp)) { 72 /* Strip newline */ 73 line[strcspn(line, "\r\n")] = '\0'; 74 trimmed = str_trim(line); 75 76 /* Skip empty lines and comments */ 77 if (!*trimmed || *trimmed == '#') 78 continue; 79 80 /* Must look like a URL */ 81 if (!str_starts_with(trimmed, "http://") && 82 !str_starts_with(trimmed, "https://")) 83 continue; 84 85 if (n >= cap) { 86 cap *= 2; 87 urls = xrealloc(urls, 88 cap * sizeof(char *)); 89 } 90 urls[n++] = xstrdup(trimmed); 91 } 92 93 if (fp != stdin) 94 fclose(fp); 95 96 *count = n; 97 return urls; 98 } 99 100 /* Write remaining URLs back to the file (for daemon mode) */ 101 static void 102 write_urls(const char *path, char **urls, int count) 103 { 104 FILE *fp; 105 int i; 106 107 if (strcmp(path, "-") == 0) 108 return; 109 110 fp = fopen(path, "w"); 111 if (!fp) { 112 warn("cannot write: %s", path); 113 return; 114 } 115 116 for (i = 0; i < count; i++) { 117 if (urls[i]) 118 fprintf(fp, "%s\n", urls[i]); 119 } 120 121 fclose(fp); 122 } 123 124 /* Remove an entry from the URL array by setting it to NULL */ 125 static void 126 remove_url(char **urls, int idx) 127 { 128 free(urls[idx]); 129 urls[idx] = NULL; 130 } 131 132 /* Count non-NULL entries */ 133 static int 134 count_remaining(char **urls, int count) 135 { 136 int i, n; 137 138 n = 0; 139 for (i = 0; i < count; i++) { 140 if (urls[i]) 141 n++; 142 } 143 return n; 144 } 145 146 /* Pick a random non-NULL index */ 147 static int 148 pick_random(char **urls, int count) 149 { 150 int remaining, target, i; 151 152 remaining = count_remaining(urls, count); 153 if (remaining == 0) 154 return -1; 155 156 target = rand() % remaining; 157 for (i = 0; i < count; i++) { 158 if (urls[i]) { 159 if (target == 0) 160 return i; 161 target--; 162 } 163 } 164 return -1; 165 } 166 167 /* Pick next non-NULL index sequentially */ 168 static int 169 pick_next(char **urls, int count) 170 { 171 int i; 172 173 for (i = 0; i < count; i++) { 174 if (urls[i]) 175 return i; 176 } 177 return -1; 178 } 179 180 /* Parse the -s services flag */ 181 static int 182 parse_services(const char *arg) 183 { 184 int svc; 185 char *copy, *tok, *saveptr; 186 187 svc = 0; 188 copy = xstrdup(arg); 189 190 for (tok = strtok_r(copy, ",", &saveptr); tok; 191 tok = strtok_r(NULL, ",", &saveptr)) { 192 tok = str_trim(tok); 193 if (strcmp(tok, "ia") == 0) 194 svc |= SVC_IA; 195 else if (strcmp(tok, "archiveph") == 0) 196 svc |= SVC_ARCHIVEPH; 197 else if (strcmp(tok, "wikiwix") == 0) 198 svc |= SVC_WIKIWIX; 199 else if (strcmp(tok, "all") == 0) 200 svc = SVC_ALL; 201 else 202 die("unknown service: %s", tok); 203 } 204 205 free(copy); 206 return svc ? svc : SVC_ALL; 207 } 208 209 int 210 main(int argc, char *argv[]) 211 { 212 const char *file; 213 char **urls; 214 int count, idx, ok, processed, opt; 215 time_t start; 216 217 while ((opt = getopt(argc, argv, "vrDd:s:h")) != -1) { 218 switch (opt) { 219 case 'v': 220 verbose = 1; 221 break; 222 case 'r': 223 random_order = 1; 224 break; 225 case 'D': 226 daemon_mode = 1; 227 break; 228 case 'd': 229 delay_secs = atoi(optarg); 230 if (delay_secs < 1) 231 delay_secs = 1; 232 break; 233 case 's': 234 services = parse_services(optarg); 235 break; 236 case 'h': /* fallthrough */ 237 default: 238 usage(); 239 } 240 } 241 242 if (optind >= argc) 243 usage(); 244 245 file = argv[optind]; 246 srand(time(NULL)); 247 248 fprintf(stderr, "%s %s\n", PROG_NAME, PROG_VERSION); 249 if (services & SVC_IA) 250 fprintf(stderr, " service: Internet Archive\n"); 251 if (services & SVC_ARCHIVEPH) 252 fprintf(stderr, " service: archive.ph\n"); 253 if (services & SVC_WIKIWIX) 254 fprintf(stderr, " service: Wikiwix\n"); 255 fprintf(stderr, " delay: %ds\n", delay_secs); 256 if (daemon_mode) 257 fprintf(stderr, " mode: daemon\n"); 258 fprintf(stderr, "\n"); 259 260 svc_init(); 261 262 processed = 0; 263 start = time(NULL); 264 265 do { 266 urls = read_urls(file, &count); 267 268 if (count == 0) { 269 if (daemon_mode) { 270 if (verbose) 271 fprintf(stderr, 272 "queue empty, waiting..." 273 "\n"); 274 sleep(delay_secs); 275 free(urls); 276 continue; 277 } 278 fprintf(stderr, "no URLs to process\n"); 279 free(urls); 280 break; 281 } 282 283 fprintf(stderr, "loaded %d URL(s)\n", count); 284 285 while (count_remaining(urls, count) > 0) { 286 if (random_order) 287 idx = pick_random(urls, count); 288 else 289 idx = pick_next(urls, count); 290 291 if (idx < 0) 292 break; 293 294 fprintf(stderr, "[%d] %s\n", 295 processed + 1, urls[idx]); 296 297 ok = svc_submit(urls[idx], services, 298 verbose); 299 fprintf(stderr, " %d service(s) OK\n", ok); 300 301 remove_url(urls, idx); 302 processed++; 303 304 /* Write back remaining URLs */ 305 if (daemon_mode) 306 write_urls(file, urls, count); 307 308 /* Rate limit */ 309 if (count_remaining(urls, count) > 0 || 310 daemon_mode) 311 sleep(delay_secs); 312 313 /* In non-daemon mode, process all URLs */ 314 if (!daemon_mode) 315 continue; 316 317 /* In daemon mode, re-read file each cycle 318 * in case new URLs were appended */ 319 break; 320 } 321 322 /* Free URL array */ 323 { 324 int i; 325 326 for (i = 0; i < count; i++) 327 free(urls[i]); 328 free(urls); 329 } 330 331 } while (daemon_mode); 332 333 fprintf(stderr, 334 "\nDone: %d URLs processed in %lds\n", 335 processed, (long)(time(NULL) - start)); 336 337 svc_cleanup(); 338 return 0; 339 }