#include #include #include #include #include #include #include "http.h" #include "util.h" #define INIT_PAGE_SIZE 8192 #define MAX_PAGE_SIZE 1048576 #define MEM_INC_FACTOR 2 #define TIMEOUT_MS 10000 #define CONNECT_TIMEOUT_MS 3000 const char *useragents[] = { "AvaBot", NULL, }; size_t bufwritecb(const byte *ptr, size_t size, size_t nmemb, writecb_data_t *userdata) { for (size_t i = 0; i < userdata->modules->len; i++) { if (userdata->modules->data[i]->module.onpagewrite != NULL) { int rc; rc = userdata->modules->data[i]->module.onpagewrite( userdata->modules->data[i]->module.userdata, userdata->url, ptr, nmemb); if (rc != 0) error("module %s onpagewrite failed with code %d", userdata->modules->data[i]->name); } } size_t len = nmemb; while (userdata->begin + nmemb + 1 > userdata->end) { // Buffer is undersized size_t buf_len = userdata->end - userdata->base; if (buf_len >= MAX_PAGE_SIZE) break; size_t new_buf_len = buf_len * MEM_INC_FACTOR; if (new_buf_len > MAX_PAGE_SIZE) new_buf_len = MAX_PAGE_SIZE; byte *new_base = xrealloc(userdata->base, new_buf_len); userdata->end = new_base + new_buf_len; userdata->begin = new_base + (userdata->begin - userdata->base); userdata->base = new_base; } if (userdata->begin + nmemb + 1 > userdata->end) // Buffer is still undersized len = userdata->end - userdata->begin; memcpy(userdata->begin, ptr, len); userdata->begin[len] = '\0'; userdata->begin += len; return len; } bool is_redirect(int status) { return status == 301 || // Moved Permanently status == 302 || // Found status == 307 || // Temporary Redirect status == 308; // Permanent Redirect } size_t headerwritecb(const char *buffer, size_t _size, size_t nitems, headercb_data_t *userdata) { const char content_type_str[] = "content-type:", html_mime_str[] = "text/html", http_str[] = "HTTP/"; // Parses HTTP status line if (nitems < sizeof(http_str) - 1) return nitems; if (memcmp(buffer, http_str, sizeof(http_str) - 1) == 0) { // Header is an http status line userdata->num_requests++; const char *status_line = memchr(buffer, ' ', nitems); for (; *status_line == ' '; status_line++) { // Ensures that status_line...(buffer+nitems) can fit a status code (3 numbers) if (status_line > buffer + nitems - 3) { userdata->status = 0; return CURL_WRITEFUNC_ERROR; } } char code_str[4] = { status_line[0], status_line[1], status_line[2], '\0' }; userdata->status = atoi(code_str); if (userdata->status == 200 || is_redirect(userdata->status)) return nitems; return CURL_WRITEFUNC_ERROR; } if (userdata->status == 0 || is_redirect(userdata->status)) return nitems; // Parses Content-Type header if (userdata->flags & HEADERCB_CONTENT_TYPE_ENCOUNTERED) return nitems; // We need an extra byte to plop header_val to the byte after the ':' if (nitems < sizeof(content_type_str)) return nitems; for (size_t i = 0; i < sizeof(content_type_str)-1; i++) if (tolower(buffer[i]) != content_type_str[i]) return nitems; const char *header_val = buffer + sizeof(content_type_str); for (; *header_val == ' '; header_val++) // Ensures that header_val..(buffer+nitems) can fit "text/html" if (header_val > buffer + nitems - sizeof(html_mime_str) + 1) return nitems; userdata->flags |= HEADERCB_CONTENT_TYPE_ENCOUNTERED; if (memcmp(header_val, html_mime_str, sizeof(html_mime_str)-1) != 0) return CURL_WRITEFUNC_ERROR; userdata->flags |= HEADERCB_VALID_MIME; return nitems; } void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data) { memset(data, 0, sizeof(cbdata_t)); data->writecb_data.base = xmalloc(INIT_PAGE_SIZE); data->writecb_data.begin = data->writecb_data.base; data->writecb_data.end = data->writecb_data.base + INIT_PAGE_SIZE; data->writecb_data.url = url; data->writecb_data.modules = modules; } CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested) { CURL *curl_h = curl_easy_init(); CURLcode easy_res; if (curl_h == NULL) { error("curl failed to initialize\n"); return NULL; } if (/* 1MiB/s max send speed */ (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_SEND_SPEED_LARGE, 1024 * 1024)) != CURLE_OK || /* 1MiB/s max recv speed */ (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_RECV_SPEED_LARGE, 1024 * 1024)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_TIMEOUT_MS, TIMEOUT_MS)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_CONNECTTIMEOUT_MS, CONNECT_TIMEOUT_MS)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_FOLLOWLOCATION, 1)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAXREDIRS, 3)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_HTTPHEADER, host_entry->headers)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_PROTOCOLS_STR, "http,https")) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEFUNCTION, bufwritecb)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEDATA, &cbdata->writecb_data)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_URL, url) != CURLE_OK)) { error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res); curl_easy_cleanup(curl_h); return NULL; } if (!wasrequested && ((easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERFUNCTION, headerwritecb)) != CURLE_OK || (easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERDATA, &cbdata->headercb_data)) != CURLE_OK)) { error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res); curl_easy_cleanup(curl_h); return NULL; } return curl_h; }