sbot

Simple web archiver — self-contained GWTAR archives
git clone https://git.krisyotam.com/krisyotam/sbot.git
Log | Files | Refs | README | LICENSE

fetch.c (4034B)


      1 /* See LICENSE file for copyright and license details. */
      2 
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 #include <unistd.h>
      7 #include <curl/curl.h>
      8 
      9 #include "config.h"
     10 #include "fetch.h"
     11 #include "util.h"
     12 
     13 static CURL *curl_handle = NULL;
     14 
     15 static size_t
     16 write_cb(void *contents, size_t size, size_t nmemb, void *userp)
     17 {
     18 	size_t realsize = size * nmemb;
     19 	Response *resp = (Response *)userp;
     20 	char *ptr;
     21 
     22 	ptr = xrealloc(resp->data, resp->size + realsize + 1);
     23 	resp->data = ptr;
     24 	memcpy(&(resp->data[resp->size]), contents, realsize);
     25 	resp->size += realsize;
     26 	resp->data[resp->size] = '\0';
     27 	return realsize;
     28 }
     29 
     30 /*
     31  * Check if an HTTP status code is transient (worth retrying).
     32  * 429 = rate limited, 5xx = server errors
     33  */
     34 static int
     35 is_transient(long code)
     36 {
     37 	return code == 429 || code == 500 || code == 502 ||
     38 	       code == 503 || code == 504;
     39 }
     40 
     41 void
     42 fetch_init(void)
     43 {
     44 	curl_global_init(CURL_GLOBAL_ALL);
     45 	curl_handle = curl_easy_init();
     46 	if (!curl_handle)
     47 		die("curl_easy_init failed");
     48 }
     49 
     50 void
     51 fetch_cleanup(void)
     52 {
     53 	if (curl_handle) {
     54 		curl_easy_cleanup(curl_handle);
     55 		curl_handle = NULL;
     56 	}
     57 	curl_global_cleanup();
     58 }
     59 
     60 Response *
     61 fetch_url(const char *url)
     62 {
     63 	Response *resp;
     64 	CURLcode res;
     65 	char *ct, *effective_url;
     66 	int attempt;
     67 
     68 	for (attempt = 0; attempt < FETCH_MAX_RETRIES; attempt++) {
     69 		if (attempt > 0) {
     70 			unsigned int delay;
     71 
     72 			delay = FETCH_RETRY_BASE * (1 << (attempt - 1));
     73 			warn("retry %d/%d for %s (waiting %us)",
     74 			     attempt, FETCH_MAX_RETRIES - 1, url, delay);
     75 			sleep(delay);
     76 		}
     77 
     78 		resp = xmalloc(sizeof(Response));
     79 		resp->data = xmalloc(1);
     80 		resp->data[0] = '\0';
     81 		resp->size = 0;
     82 		resp->content_type = NULL;
     83 		resp->status_code = 0;
     84 		resp->final_url = NULL;
     85 
     86 		curl_easy_reset(curl_handle);
     87 		curl_easy_setopt(curl_handle, CURLOPT_URL, url);
     88 		curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION,
     89 		                 write_cb);
     90 		curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA,
     91 		                 (void *)resp);
     92 		curl_easy_setopt(curl_handle, CURLOPT_USERAGENT,
     93 		                 USER_AGENT);
     94 		curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
     95 		curl_easy_setopt(curl_handle, CURLOPT_MAXREDIRS,
     96 		                 MAX_REDIRECTS);
     97 		curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT,
     98 		                 CONNECT_TIMEOUT);
     99 		curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT,
    100 		                 REQUEST_TIMEOUT);
    101 		curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 1L);
    102 		curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYHOST, 2L);
    103 		curl_easy_setopt(curl_handle, CURLOPT_ACCEPT_ENCODING,
    104 		                 "");
    105 
    106 		res = curl_easy_perform(curl_handle);
    107 
    108 		if (res != CURLE_OK) {
    109 			/* Network-level failure */
    110 			if (res == CURLE_OPERATION_TIMEDOUT ||
    111 			    res == CURLE_COULDNT_CONNECT ||
    112 			    res == CURLE_GOT_NOTHING) {
    113 				warn("fetch: %s: %s",
    114 				     url, curl_easy_strerror(res));
    115 				response_free(resp);
    116 				resp = NULL;
    117 				continue;
    118 			}
    119 			/* Non-transient curl error */
    120 			warn("fetch: %s: %s",
    121 			     url, curl_easy_strerror(res));
    122 			response_free(resp);
    123 			return NULL;
    124 		}
    125 
    126 		curl_easy_getinfo(curl_handle,
    127 		                  CURLINFO_RESPONSE_CODE,
    128 		                  &resp->status_code);
    129 
    130 		ct = NULL;
    131 		if (curl_easy_getinfo(curl_handle,
    132 		                      CURLINFO_CONTENT_TYPE,
    133 		                      &ct) == CURLE_OK && ct)
    134 			resp->content_type = xstrdup(ct);
    135 
    136 		effective_url = NULL;
    137 		if (curl_easy_getinfo(curl_handle,
    138 		                      CURLINFO_EFFECTIVE_URL,
    139 		                      &effective_url) == CURLE_OK &&
    140 		    effective_url)
    141 			resp->final_url = xstrdup(effective_url);
    142 
    143 		/* Retry on transient HTTP errors */
    144 		if (is_transient(resp->status_code)) {
    145 			response_free(resp);
    146 			resp = NULL;
    147 			continue;
    148 		}
    149 
    150 		return resp;
    151 	}
    152 
    153 	/* All retries exhausted */
    154 	warn("fetch: gave up on %s after %d attempts",
    155 	     url, FETCH_MAX_RETRIES);
    156 	return resp;
    157 }
    158 
    159 void
    160 response_free(Response *resp)
    161 {
    162 	if (!resp)
    163 		return;
    164 	free(resp->data);
    165 	free(resp->content_type);
    166 	free(resp->final_url);
    167 	free(resp);
    168 }