suploader

Simple uploader — submit URLs to web archive services
git clone https://git.krisyotam.com/krisyotam/suploader.git
Log | Files | Refs | LICENSE

suploader.c (6358B)


      1 /* See LICENSE file for copyright and license details.
      2  *
      3  * suploader - Simple Uploader
      4  *
      5  * Uploads URLs to web archive services (Internet Archive,
      6  * archive.today, Wikiwix). Reads URLs from a file or stdin.
      7  */
      8 
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "config.h"
#include "services.h"
#include "util.h"
     18 
     19 /* Global options */
     20 static int verbose = 0;
     21 static int daemon_mode = 0;
     22 static int delay_secs = DEFAULT_DELAY;
     23 static int random_order = 0;
     24 static int services = SVC_ALL;
     25 
     26 static void
     27 usage(void)
     28 {
     29 	fprintf(stderr,
     30 	    "usage: suploader [-vrD] [-d secs] [-s services]"
     31 	    " [file | -]\n"
     32 	    "\n"
     33 	    "  -v          verbose output\n"
     34 	    "  -r          random URL selection order\n"
     35 	    "  -D          daemon mode (loop forever)\n"
     36 	    "  -d secs     delay between submissions"
     37 	    " (default: %d)\n"
     38 	    "  -s services comma-separated: ia,archiveph,"
     39 	    "wikiwix (default: all)\n"
     40 	    "\n"
     41 	    "  file        file with URLs, one per line\n"
     42 	    "  -           read from stdin\n",
     43 	    DEFAULT_DELAY);
     44 	exit(1);
     45 }
     46 
     47 /* Read all URLs from a file into an array.
     48  * Returns array of strings, sets *count.
     49  * Caller frees the array and each string. */
     50 static char **
     51 read_urls(const char *path, int *count)
     52 {
     53 	FILE *fp;
     54 	char line[MAX_URL_LEN];
     55 	char **urls;
     56 	int n, cap;
     57 	char *trimmed;
     58 
     59 	if (strcmp(path, "-") == 0)
     60 		fp = stdin;
     61 	else
     62 		fp = fopen(path, "r");
     63 
     64 	if (!fp)
     65 		die("cannot open: %s:", path);
     66 
     67 	cap = 256;
     68 	n = 0;
     69 	urls = xmalloc(cap * sizeof(char *));
     70 
     71 	while (fgets(line, sizeof(line), fp)) {
     72 		/* Strip newline */
     73 		line[strcspn(line, "\r\n")] = '\0';
     74 		trimmed = str_trim(line);
     75 
     76 		/* Skip empty lines and comments */
     77 		if (!*trimmed || *trimmed == '#')
     78 			continue;
     79 
     80 		/* Must look like a URL */
     81 		if (!str_starts_with(trimmed, "http://") &&
     82 		    !str_starts_with(trimmed, "https://"))
     83 			continue;
     84 
     85 		if (n >= cap) {
     86 			cap *= 2;
     87 			urls = xrealloc(urls,
     88 			    cap * sizeof(char *));
     89 		}
     90 		urls[n++] = xstrdup(trimmed);
     91 	}
     92 
     93 	if (fp != stdin)
     94 		fclose(fp);
     95 
     96 	*count = n;
     97 	return urls;
     98 }
     99 
    100 /* Write remaining URLs back to the file (for daemon mode) */
    101 static void
    102 write_urls(const char *path, char **urls, int count)
    103 {
    104 	FILE *fp;
    105 	int i;
    106 
    107 	if (strcmp(path, "-") == 0)
    108 		return;
    109 
    110 	fp = fopen(path, "w");
    111 	if (!fp) {
    112 		warn("cannot write: %s", path);
    113 		return;
    114 	}
    115 
    116 	for (i = 0; i < count; i++) {
    117 		if (urls[i])
    118 			fprintf(fp, "%s\n", urls[i]);
    119 	}
    120 
    121 	fclose(fp);
    122 }
    123 
    124 /* Remove an entry from the URL array by setting it to NULL */
    125 static void
    126 remove_url(char **urls, int idx)
    127 {
    128 	free(urls[idx]);
    129 	urls[idx] = NULL;
    130 }
    131 
    132 /* Count non-NULL entries */
    133 static int
    134 count_remaining(char **urls, int count)
    135 {
    136 	int i, n;
    137 
    138 	n = 0;
    139 	for (i = 0; i < count; i++) {
    140 		if (urls[i])
    141 			n++;
    142 	}
    143 	return n;
    144 }
    145 
    146 /* Pick a random non-NULL index */
    147 static int
    148 pick_random(char **urls, int count)
    149 {
    150 	int remaining, target, i;
    151 
    152 	remaining = count_remaining(urls, count);
    153 	if (remaining == 0)
    154 		return -1;
    155 
    156 	target = rand() % remaining;
    157 	for (i = 0; i < count; i++) {
    158 		if (urls[i]) {
    159 			if (target == 0)
    160 				return i;
    161 			target--;
    162 		}
    163 	}
    164 	return -1;
    165 }
    166 
    167 /* Pick next non-NULL index sequentially */
    168 static int
    169 pick_next(char **urls, int count)
    170 {
    171 	int i;
    172 
    173 	for (i = 0; i < count; i++) {
    174 		if (urls[i])
    175 			return i;
    176 	}
    177 	return -1;
    178 }
    179 
    180 /* Parse the -s services flag */
    181 static int
    182 parse_services(const char *arg)
    183 {
    184 	int svc;
    185 	char *copy, *tok, *saveptr;
    186 
    187 	svc = 0;
    188 	copy = xstrdup(arg);
    189 
    190 	for (tok = strtok_r(copy, ",", &saveptr); tok;
    191 	     tok = strtok_r(NULL, ",", &saveptr)) {
    192 		tok = str_trim(tok);
    193 		if (strcmp(tok, "ia") == 0)
    194 			svc |= SVC_IA;
    195 		else if (strcmp(tok, "archiveph") == 0)
    196 			svc |= SVC_ARCHIVEPH;
    197 		else if (strcmp(tok, "wikiwix") == 0)
    198 			svc |= SVC_WIKIWIX;
    199 		else if (strcmp(tok, "all") == 0)
    200 			svc = SVC_ALL;
    201 		else
    202 			die("unknown service: %s", tok);
    203 	}
    204 
    205 	free(copy);
    206 	return svc ? svc : SVC_ALL;
    207 }
    208 
    209 int
    210 main(int argc, char *argv[])
    211 {
    212 	const char *file;
    213 	char **urls;
    214 	int count, idx, ok, processed, opt;
    215 	time_t start;
    216 
    217 	while ((opt = getopt(argc, argv, "vrDd:s:h")) != -1) {
    218 		switch (opt) {
    219 		case 'v':
    220 			verbose = 1;
    221 			break;
    222 		case 'r':
    223 			random_order = 1;
    224 			break;
    225 		case 'D':
    226 			daemon_mode = 1;
    227 			break;
    228 		case 'd':
    229 			delay_secs = atoi(optarg);
    230 			if (delay_secs < 1)
    231 				delay_secs = 1;
    232 			break;
    233 		case 's':
    234 			services = parse_services(optarg);
    235 			break;
    236 		case 'h': /* fallthrough */
    237 		default:
    238 			usage();
    239 		}
    240 	}
    241 
    242 	if (optind >= argc)
    243 		usage();
    244 
    245 	file = argv[optind];
    246 	srand(time(NULL));
    247 
    248 	fprintf(stderr, "%s %s\n", PROG_NAME, PROG_VERSION);
    249 	if (services & SVC_IA)
    250 		fprintf(stderr, "  service: Internet Archive\n");
    251 	if (services & SVC_ARCHIVEPH)
    252 		fprintf(stderr, "  service: archive.ph\n");
    253 	if (services & SVC_WIKIWIX)
    254 		fprintf(stderr, "  service: Wikiwix\n");
    255 	fprintf(stderr, "  delay: %ds\n", delay_secs);
    256 	if (daemon_mode)
    257 		fprintf(stderr, "  mode: daemon\n");
    258 	fprintf(stderr, "\n");
    259 
    260 	svc_init();
    261 
    262 	processed = 0;
    263 	start = time(NULL);
    264 
    265 	do {
    266 		urls = read_urls(file, &count);
    267 
    268 		if (count == 0) {
    269 			if (daemon_mode) {
    270 				if (verbose)
    271 					fprintf(stderr,
    272 					    "queue empty, waiting..."
    273 					    "\n");
    274 				sleep(delay_secs);
    275 				free(urls);
    276 				continue;
    277 			}
    278 			fprintf(stderr, "no URLs to process\n");
    279 			free(urls);
    280 			break;
    281 		}
    282 
    283 		fprintf(stderr, "loaded %d URL(s)\n", count);
    284 
    285 		while (count_remaining(urls, count) > 0) {
    286 			if (random_order)
    287 				idx = pick_random(urls, count);
    288 			else
    289 				idx = pick_next(urls, count);
    290 
    291 			if (idx < 0)
    292 				break;
    293 
    294 			fprintf(stderr, "[%d] %s\n",
    295 			    processed + 1, urls[idx]);
    296 
    297 			ok = svc_submit(urls[idx], services,
    298 			    verbose);
    299 			fprintf(stderr, "  %d service(s) OK\n", ok);
    300 
    301 			remove_url(urls, idx);
    302 			processed++;
    303 
    304 			/* Write back remaining URLs */
    305 			if (daemon_mode)
    306 				write_urls(file, urls, count);
    307 
    308 			/* Rate limit */
    309 			if (count_remaining(urls, count) > 0 ||
    310 			    daemon_mode)
    311 				sleep(delay_secs);
    312 
    313 			/* In non-daemon mode, process all URLs */
    314 			if (!daemon_mode)
    315 				continue;
    316 
    317 			/* In daemon mode, re-read file each cycle
    318 			 * in case new URLs were appended */
    319 			break;
    320 		}
    321 
    322 		/* Free URL array */
    323 		{
    324 			int i;
    325 
    326 			for (i = 0; i < count; i++)
    327 				free(urls[i]);
    328 			free(urls);
    329 		}
    330 
    331 	} while (daemon_mode);
    332 
    333 	fprintf(stderr,
    334 	    "\nDone: %d URLs processed in %lds\n",
    335 	    processed, (long)(time(NULL) - start));
    336 
    337 	svc_cleanup();
    338 	return 0;
    339 }