sbot

Simple web archiver — self-contained GWTAR archives
git clone https://git.krisyotam.com/krisyotam/sbot.git
Log | Files | Refs | README | LICENSE

detect.h (1116B)


      1 /* See LICENSE file for copyright and license details. */
      2 
      3 #ifndef DETECT_H
      4 #define DETECT_H
      5 
      6 /* Site/CMS types that detection can report; SITE_UNKNOWN is the first enumerator, so its value is 0 */
      7 typedef enum {
      8 	SITE_UNKNOWN,
      9 	SITE_WORDPRESS,
     10 	SITE_BLOGGER,
     11 	SITE_HUGO,
     12 	SITE_JEKYLL,
     13 	SITE_GHOST,
     14 	SITE_DRUPAL,
     15 	SITE_MEDIAWIKI
     16 } SiteType;
     17 
     18 /* Result of site detection: the CMS type plus optional URL hints used for archiving */
     19 typedef struct {
     20 	SiteType type;          /* detected CMS, one of the SITE_* values */
     21 	const char *name;       /* human-readable CMS name; const, so not modified through this struct */
     22 	char *feed_url;         /* RSS/Atom feed URL if found (presumably NULL otherwise -- TODO confirm) */
     23 	char *api_url;          /* REST API base if found (presumably NULL otherwise -- TODO confirm) */
     24 	char *sitemap_url;      /* sitemap.xml URL if found (presumably NULL otherwise -- TODO confirm) */
     25 	int has_json_api;       /* flag: site has a JSON API (presumably nonzero means yes -- TODO confirm) */
     26 } SiteInfo;
     27 
     28 /* Detect CMS type from page HTML and its URL; release the result with siteinfo_free() (NULL on failure is presumed -- TODO confirm) */
     29 SiteInfo *detect_site(const char *html, const char *url);
     30 
     31 /* Free a detection result (SiteInfo) such as one returned by detect_site(); NOTE(review): NULL handling is not visible here -- confirm */
     32 void siteinfo_free(SiteInfo *info);
     33 
     34 /* Get sitemap URLs for a detected site on `domain`; `count` is presumably set to the number of strings returned -- TODO confirm count and who frees the array */
     35 char **detect_sitemap_urls(SiteInfo *info, const char *domain,
     36                            int *count);
     37 
     38 /* Get additional seed URLs for `domain` based on the detected CMS type; `count` is presumably set to the number of strings returned -- TODO confirm count and who frees the array */
     39 char **detect_seed_urls(SiteInfo *info, const char *domain,
     40                         int *count);
     41 
     42 #endif /* DETECT_H */