/*----------------------------------------------------------------------*\
 |  Get real web access stats for yesterday.
 |
 |  Peter N. Schweitzer (U.S. Geological Survey, Reston VA 20192)
\*----------------------------------------------------------------------*/

/* NOTE(review): the header names of the seven #include directives were
   missing from the source text; the list below is reconstructed from the
   library calls this file makes.  Confirm against the original. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>

extern char *hostname_of (char *ip);

/* One parsed line of the access log.  The string members point into the
   log buffer (or at no_value); they are not separately allocated. */
struct entry {
    int line_number;
    char *host;
    char *when;
    char *request_method;
    char *url;
    char *protocol;
    int code;
    long bytes;
    char *refer;
    char *agent;
    char *forward;
    };

/* Value given to every string field that a log line does not supply, so
   that later code never sees a null or uninitialized pointer. */
static char no_value [] = "";

static char *this_host = "http://geo-nsdi.er.usgs.gov/";
static char *document_root = "/var/lib/httpd/htdocs";

/*----------------------------------------------------------------------*\
 |  Get the entire log file as one block of memory.
 |
 |  Returns a NUL-terminated buffer that the caller must free, or NULL
 |  if the name is NULL, the file cannot be opened or measured, is
 |  empty, or memory cannot be allocated.
\*----------------------------------------------------------------------*/

static char *read_text (char *name)
{
    FILE *fp;
    char *dst = NULL;
    long length;
    size_t got;

    if (!name) return (NULL);

    fp = fopen (name,"rb");
    if (!fp) {
        fprintf (stderr,"Error: could not open input file %s\n",name);
        return (NULL);
        }

    if (fseek (fp,0L,SEEK_END) == 0) {
        length = ftell (fp);
        if (length > 0L) {
            dst = malloc ((size_t) length + 1);
            if (dst) {
                rewind (fp);
                /* Read no more than was measured so the terminator
                   always lands inside the allocation. */
                got = fread (dst,1,(size_t) length,fp);
                dst [got] = 0;
                }
            else fprintf (stderr,"Error: could not allocate space for file\n");
            }
        else fprintf (stderr,"Error: Input file is empty.\n");
        }
    fclose (fp);
    return (dst);
    }

/*----------------------------------------------------------------------*\
 |  Ignore hits by robots and spiders; use key text in the agent string
 |  to recognize these users.
\*----------------------------------------------------------------------*/

static char *spider_name_file = "spiders.txt";
extern int is_spider (char *agent, char *host);
extern int read_spider_names (char *spider_name_file);

/*----------------------------------------------------------------------*\
 |  Determine the real size of a file; so we can compare the number of
 |  bytes transferred to the file size.  Count only those transfers in
 |  which the whole file was downloaded.
 |
 |  Returns 0 if the path is too long, cannot be examined, or is not a
 |  regular file.
\*----------------------------------------------------------------------*/

static long file_size (char *url)
{
    struct stat info;
    char name [FILENAME_MAX];
    int n;

    if (!url) return (0);

    n = snprintf (name,sizeof name,"%s%s",document_root,url);
    if (n < 0 || (size_t) n >= sizeof name) return (0);

    if (stat (name,&info) == 0 && S_ISREG (info.st_mode))
        return ((long) info.st_size);
    return (0);
    }

/*----------------------------------------------------------------------*\
 |  Count only documents that are metadata records.  These will have a
 |  URL that contains "metadata/" followed by a series abbreviation.
\*----------------------------------------------------------------------*/

struct series {
    char *signature;
    char *name;
    };

static struct series series_list[] = {
    {"metadata/bulletin/",             "Bulletin"},
    {"metadata/digital-data/",         "Digital Data Series"},
    {"metadata/map-i/",                "Miscellaneous Investigations Map"},
    {"metadata/map-mf/",               "Miscellaneous Field Studies Map"},
    {"metadata/mineral/",              "Mineral commodity and minerals industry reports"},
    {"metadata/mining-and-quarrying/", "Mining and Quarrying"},
    {"metadata/open-file/",            "Open-File Report"},
    {"metadata/other/",                "Reports published by other organizations"},
    {"metadata/professional-paper/",   "Professional Paper"},
    {"metadata/provisional/",          "Information in provisional release"},
    {NULL,NULL}
    };

/* Return the series name for the first signature found in url, or NULL
   if url is NULL or contains none of them. */
static char *series_of (char *url)
{
    int i;

    if (!url) return (NULL);
    for (i=0; series_list[i].signature; i++)
        if (strstr (url,series_list[i].signature))
            return (series_list[i].name);
    return (NULL);
    }

/*----------------------------------------------------------------------*\
 |  Don't count duplicate downloads (one document downloaded by the same
 |  host twice or more).
 |
 |  The download array is kept sorted by host, then url, at all times;
 |  repeat() relies on that ordering for its binary search.  As a
 |  consequence the per-download output files list downloads in sorted
 |  order, not in log order.
\*----------------------------------------------------------------------*/

struct download {
    char *host;
    char *url;
    char *agent;
    char *refer;
    };

static struct download *download = NULL;
static int download_size = 0;       /* elements allocated */
static int download_count = 0;      /* elements in use    */

#define GRANULARITY 65536

/* Order two downloads by host, then by url (qsort/bsearch callback). */
static int compare (const void *e1, const void *e2)
{
    const struct download *p1 = e1;
    const struct download *p2 = e2;
    int d = strcmp (p1->host,p2->host);
    if (d == 0) d = strcmp (p1->url,p2->url);
    return (d);
    }

/* Index of the first stored download that does not sort before *key. */
static int insertion_point (const struct download *key)
{
    int lo = 0;
    int hi = download_count;

    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        if (compare (&download [mid],key) < 0) lo = mid + 1;
        else hi = mid;
        }
    return (lo);
    }

/* Add a download to the list, growing the array by GRANULARITY elements
   when it is full.  The strings are referenced, not copied.  If memory
   cannot be obtained the download is reported and dropped. */
static void remember_download (char *host, char *url, char *agent, char *refer)
{
    struct download item;
    int at;

    if (download_count >= download_size) {
        struct download *grown = realloc (download,
            ((size_t) download_size + GRANULARITY) * sizeof *grown);
        if (!grown) {
            fprintf (stderr,"Error: could not allocate space for download list\n");
            return;
            }
        download = grown;
        download_size += GRANULARITY;
        }

    item.host  = host;
    item.url   = url;
    item.agent = agent;
    item.refer = refer;

    /* Insert in order so the array stays searchable. */
    at = insertion_point (&item);
    memmove (download + at + 1,download + at,
        (size_t) (download_count - at) * sizeof *download);
    download [at] = item;
    download_count++;
    }

/* Return 1 if this host has already been recorded downloading this url,
   0 otherwise. */
static int repeat (char *host, char *url)
{
    struct download target;

    if (download_count == 0) return (0);

    target.host  = host;
    target.url   = url;
    target.agent = NULL;
    target.refer = NULL;
    return (bsearch (&target,download,download_count,
        sizeof (struct download),compare) != NULL);
    }

/* Count the distinct hosts in the download list.  Equal hosts are
   adjacent because the list is sorted by host first.  Not called at
   present; the statistic it produced is not written out. */
static int count_unique_hosts (void)
{
    int i,n;
    char *h;

    n = 0;
    h = "";
    for (i=0; i < download_count; i++)
        if (strcmp (download[i].host,h) != 0) {
            n++;
            h = download[i].host;
            }
    return (n);
    }

/* Return 1 if name exists and is a directory, 0 otherwise. */
static int is_directory (char *name)
{
    struct stat info;

    if (stat (name,&info) == 0 && S_ISDIR (info.st_mode)) return (1);
    return (0);
    }

/* Determine whether an IP address given as a string is numeric, that is,
   non-empty and made up only of digits and dots. */
static int is_numeric (char *addr)
{
    char *s;

    if (!addr || !*addr) return (0);
    for (s=addr; *s; s++)
        if (!(*s == '.' || isdigit ((unsigned char) *s))) return (0);
    return (1);
    }

/*----------------------------------------------------------------------*\
 |  For every remembered download whose referer is an outside URL that
 |  carries a query string, append the referer to a ".ref" file named
 |  after the downloaded document.
\*----------------------------------------------------------------------*/

static void write_refs (void)
{
    size_t host_length = strlen (this_host);
    int i;

    for (i=0; i < download_count; i++) {
        char name [FILENAME_MAX];
        char dir [FILENAME_MAX];
        char *refer = download[i].refer;
        char *url = download[i].url;
        char *s, *t;
        FILE *out;
        int n;

        if (!refer || !url) continue;
        if (strlen (refer) <= 1) continue;
        if (strncmp (refer,this_host,host_length) == 0) continue;
        if (!strchr (refer,'?')) continue;

        n = snprintf (name,sizeof name,"%s%s",document_root,url);
        if (n < 0 || (size_t) n >= sizeof name) continue;

        s = strrchr (name,'.');
        if (!s) continue;
        if (strcmp (s,".html") == 0)
            if ((t = strstr (name,".faq.html")) != NULL) s = t;

        /* ".ref" may be longer than the extension it replaces. */
        if ((size_t) (s - name) + sizeof ".ref" > sizeof name) continue;
        strcpy (s,".ref");

        /* Store the referers in a parallel directory structure */
        if ((s = strstr (name,"/metadata/")) != NULL) memcpy (s,"/referers/",10);

        /* Ensure that the directory exists */
        strcpy (dir,name);
        if ((s = strrchr (dir,'/')) != NULL) *s = 0;
        if (!is_directory (dir))
            if (mkdir (dir,0755) == -1)
                fprintf (stderr,"Error: failed to create %s\n",dir);

        out = fopen (name,"a");
        if (out) {
            fprintf (out,"%s\n",refer);
            fclose (out);
            }
        else fprintf (stderr,"Error: could not open referer file %s\n",name);
        }
    }

/*----------------------------------------------------------------------*\
 |  Log line parsing
\*----------------------------------------------------------------------*/

/* Return a malloc'd copy of text, or NULL if memory is exhausted. */
static char *duplicate_string (const char *text)
{
    size_t size = strlen (text) + 1;
    char *copy = malloc (size);
    if (copy) memcpy (copy,text,size);
    return (copy);
    }

/* If *cursor is at the character open, cut out the text up to the next
   close character, step over one following whitespace character, leave
   *cursor after it and return the text.  Otherwise return NULL and
   leave *cursor alone. */
static char *take_delimited (char **cursor, char open, char close)
{
    char *start, *s;

    if (**cursor != open) return (NULL);
    start = *cursor + 1;
    for (s=start; *s && *s != close; s++);
    if (*s == close) *s++ = 0;
    if (isspace ((unsigned char) *s)) *s++ = 0;
    *cursor = s;
    return (start);
    }

/* Cut out the text up to the next whitespace character, leave *cursor
   after it and return the text (empty at end of line). */
static char *take_word (char **cursor)
{
    char *start = *cursor;
    char *s = start;

    while (*s && !isspace ((unsigned char) *s)) s++;
    if (*s) *s++ = 0;
    *cursor = s;
    return (start);
    }

/* Split one NUL-terminated log line, in place, into the fields of *p.
   String fields the line does not contain are left as no_value. */
static void parse_entry (struct entry *p, char *line, int line_number)
{
    char *ptr = line;
    char *field;

    p->line_number = line_number;
    p->host = p->when = p->request_method = p->url = p->protocol = no_value;
    p->refer = p->agent = p->forward = no_value;
    p->code = 0;
    p->bytes = 0;

    /* The host may or may not be enclosed in double quotes. */
    field = take_delimited (&ptr,'"','"');
    p->host = field ? field : take_word (&ptr);

    /*--------------------------------------------------*\
     |  Skip user name and password, go directly to the
     |  opening bracket that begins the date field.
    \*--------------------------------------------------*/
    while (*ptr && *ptr != '[') ptr++;
    if ((field = take_delimited (&ptr,'[',']')) != NULL) p->when = field;

    /* The quoted request holds the method, url and protocol. */
    if ((field = take_delimited (&ptr,'"','"')) != NULL) {
        p->request_method = take_word (&field);
        p->url = take_word (&field);
        p->protocol = take_word (&field);
        }

    /* A field that is not a number (such as "-") counts as zero. */
    p->code = (int) strtol (take_word (&ptr),NULL,10);
    p->bytes = strtol (take_word (&ptr),NULL,10);

    if ((field = take_delimited (&ptr,'"','"')) != NULL) p->refer = field;
    if ((field = take_delimited (&ptr,'"','"')) != NULL) p->agent = field;
    if ((field = take_delimited (&ptr,'"','"')) != NULL) p->forward = field;
    }

/* Print one parsed entry to standard output (debugging aid). */
static void print_entry (const struct entry *p, int index)
{
    printf ("%d:\n",index);
    printf (" line: %d\n",p->line_number);
    printf (" host: %s\n",p->host);
    printf (" when: %s\n",p->when);
    printf (" method: %s\n",p->request_method);
    printf (" url: %s\n",p->url);
    printf ("protocol: %s\n",p->protocol);
    printf (" code: %d\n",p->code);
    printf (" bytes: %ld\n",p->bytes);
    printf (" referer: %s\n",p->refer);
    printf (" agent: %s\n",p->agent);
    printf (" forward: %s\n",p->forward);
    }

/*----------------------------------------------------------------------*\
 |  Selection and counting
\*----------------------------------------------------------------------*/

/* Number of counted downloads of each kind of file. */
struct tally {
    int met;
    int txt;
    int html;
    int faq;
    int sgml;
    int xml;
    int dif;
    };

/* Decide whether an entry is a real download of real metadata by a real
   person outside USGS.  Returns 1 if so, 0 if not.  As a side effect
   the host is cut at its first comma and, if numeric, replaced by the
   name hostname_of() reports for it. */
static int is_real_download (struct entry *p)
{
    char *s, *t;

    /* Must be a metadata record */
    if (!series_of (p->url)) return (0);

    /* Must be successful download */
    if (p->code != 200) return (0);

    /*--------------------------------------------------*\
     |  X-Forwarded-For may contain more than one address
     |  separated by comma and space.  This happens, I
     |  think, when a user goes through a proxy server
     |  to access the internet or when a second reverse
     |  proxy is involved in the exchange.
     |
     |  If the value contains a comma, terminate the
     |  string there.  This discards the second and
     |  later ip addresses; hopefully the first address
     |  is that of the original requestor.
    \*--------------------------------------------------*/
    if ((s = strchr (p->host,',')) != NULL) *s = 0;

    /* If the address of the host is numeric, look it up.  The copy of
       the name is kept until the program exits because the download
       list refers to it. */
    if (is_numeric (p->host))
        if ((s = hostname_of (p->host)) != NULL)
            if ((t = duplicate_string (s)) != NULL) p->host = t;

    /* Must be a user outside USGS */
    if (strstr (p->host,".usgs.gov")) return (0);

    /* Must not be a web spider or robot */
    if (is_spider (p->agent,p->host)) return (0);

    /* Must not be an error report */
    if (strstr (p->url,".err")) return (0);

    /* Must be the whole file */
    if (p->bytes < file_size (p->url)) return (0);

    /* Must not be a repeat download */
    if (repeat (p->host,p->url)) return (0);

    return (1);
    }

/* Add one download to the counts for the kinds of file its url names. */
static void count_download (struct tally *counts, const char *url)
{
    if (strstr (url,".xml"))  counts->xml++;
    if (strstr (url,".sgml")) counts->sgml++;
    if (strstr (url,".met"))  counts->met++;
    if (strstr (url,".txt"))  counts->txt++;
    if (strstr (url,".dif"))  counts->dif++;
    if (strstr (url,".html")) {
        if (strstr (url,".faq")) counts->faq++;
        else counts->html++;
        }
    }

/*----------------------------------------------------------------------*\
 |  Output
\*----------------------------------------------------------------------*/

/*----------------------------------------------------------*\
 |  Write a line of statistics into an HTML file.  This file
 |  will be included in another using a SSI directive.
 |
 |  I don't write out a count of XML file accesses because I
 |  don't produce them.  This is because the USGS search
 |  engine (Ultraseek) ranks the web-accessible XML files
 |  higher than the corresponding HTML files.  I have been
 |  unable to keep this search engine from indexing the XML
 |  files, consequently I simply don't provide them.
 |
 |  NOTE(review): the format strings below contain no HTML
 |  markup and no separators, so the values run together.
 |  The tags appear to have been lost from this copy of the
 |  source; restore them from the original before relying on
 |  the output.
\*----------------------------------------------------------*/
static void write_statistics (const char *date, const struct tally *c)
{
    FILE *out = fopen ("/var/lib/httpd/htdocs/dl.html","a");

    if (!out) {
        fprintf (stderr,"Error: could not open statistics file\n");
        return;
        }
    fprintf (out,"");
    fprintf (out,"%s",date);
    fprintf (out,"%d",c->met + c->txt);
    fprintf (out,"%d",c->html);
    fprintf (out,"%d",c->faq);
    fprintf (out,"%d",c->sgml);
    fprintf (out,"%d",c->dif);
    fprintf (out,"%d",c->met + c->txt + c->sgml + c->xml + c->html + c->faq + c->dif);
    fprintf (out,"\n");
    fclose (out);
    }

/* Which member of each download write_list() prints. */
enum column { COLUMN_AGENT, COLUMN_REFER, COLUMN_HOST, COLUMN_FILE };

/* Write one member of every remembered download, one per line, into the
   file named prefix-date in the current directory.  For COLUMN_FILE a
   leading this_host is removed from the url. */
static void write_list (const char *prefix, const char *date, enum column which)
{
    char output_file [FILENAME_MAX];
    size_t k = strlen (this_host);
    FILE *out;
    int i,n;

    n = snprintf (output_file,sizeof output_file,"%s-%s",prefix,date);
    if (n < 0 || (size_t) n >= sizeof output_file) return;

    out = fopen (output_file,"w");
    if (!out) {
        fprintf (stderr,"Error: could not open output file %s\n",output_file);
        return;
        }

    for (i=0; i < download_count; i++) {
        char *text;
        switch (which) {
            case COLUMN_AGENT: text = download[i].agent; break;
            case COLUMN_REFER: text = download[i].refer; break;
            case COLUMN_HOST:  text = download[i].host;  break;
            default:
                text = download[i].url;
                if (strncmp (text,this_host,k) == 0) text += k;
                break;
            }
        fprintf (out,"%s\n",text);
        }
    fclose (out);
    }

/*----------------------------------------------------------------------*\
 |  Main program
 |
 |  Returns 0 when the log was read and processed, nonzero if the spider
 |  names or the log could not be read or memory ran out.
\*----------------------------------------------------------------------*/

int main (int argc, char *argv[])
{
    char *input_file = "/var/lib/httpd/logs/access_log";
    char yesterday [64];
    time_t then = time (NULL) - 24*60*60;
    struct tm *when;
    char *buffer, *ptr, *eol, *s;
    struct entry *list, *p;
    struct tally counts = {0,0,0,0,0,0,0};
    int entry_count;
    int line_total;
    int line_number = 0;
    int verbose = 0;
    int i;

    /*------------------------------------------------------------------*\
     |  Derive the string that we will use to determine that an access
     |  occurred yesterday.  If the user has specified a command-line
     |  argument, use that as the pattern to be matched instead.
     |  That allows you to collect stats for a whole month at a time.
    \*------------------------------------------------------------------*/

    when = localtime (&then);
    if (!when || strftime (yesterday,sizeof yesterday,"%d/%b/%Y",when) == 0) {
        fprintf (stderr,"Error: could not determine yesterday's date\n");
        return (EXIT_FAILURE);
        }
    if (argc > 1) {
        if (strlen (argv[1]) < sizeof yesterday) strcpy (yesterday,argv[1]);
        else fprintf (stderr,"Warning: pattern is too long, using %s instead\n",yesterday);
        }

    if (!read_spider_names (spider_name_file)) {
        fprintf (stderr,"Error: could not read spider name file %s\n",spider_name_file);
        return (EXIT_FAILURE);
        }

    /*------------------------------------------------------------------*\
     |  Read the web access log, parsing each entry.
    \*------------------------------------------------------------------*/

    buffer = read_text (input_file);
    if (!buffer) return (EXIT_FAILURE);

    /* There cannot be more entries than there are lines. */
    line_total = 1;
    for (ptr = buffer; *ptr; ptr++)
        if (*ptr == '\n') line_total++;

    list = calloc ((size_t) line_total,sizeof *list);
    if (!list) {
        fprintf (stderr,"Error: could not allocate space for log entries\n");
        free (buffer);
        return (EXIT_FAILURE);
        }

    p = list;
    ptr = buffer;
    while (*ptr) {
        /* Terminate the line at its newline; a carriage return ends the
           text of the line early. */
        for (eol=ptr; *eol && *eol != '\n'; eol++)
            if (*eol == '\r') *eol = 0;
        if (*eol == '\n') *eol++ = 0;

        line_number++;
        if (*ptr && strstr (ptr,yesterday)) {
            parse_entry (p,ptr,line_number);
            if (verbose) print_entry (p,(int) (p - list));
            p++;
            }
        ptr = eol;
        }
    entry_count = (int) (p - list);

    /*----------------------------------------------------------*\
     |  Look through all of the entries.  Remember only those
     |  that pass a number of tests designed to ignore activity
     |  other than real downloads of real metadata by real people
     |  outside USGS.
    \*----------------------------------------------------------*/

    for (i=0, p=list; i < entry_count; i++, p++)
        if (is_real_download (p)) {
            remember_download (p->host,p->url,p->agent,p->refer);
            count_download (&counts,p->url);
            }

    /* The date becomes part of several file names, so it must not
       contain a slash whether or not the statistics file opens. */
    for (s=yesterday; *s; s++)
        if (*s == '/') *s = '-';

    write_statistics (yesterday,&counts);

    /*----------------------------------------------------------*\
     |  Write files containing all of the HTTP user agent names
     |  (so we can detect new robots when they appear), all of
     |  the HTTP referrers (so we can study how people find
     |  records), all of the requesting hosts, and all of the
     |  requested files.
    \*----------------------------------------------------------*/

    write_list ("agent",yesterday,COLUMN_AGENT);
    write_list ("refer",yesterday,COLUMN_REFER);
    write_list ("host",yesterday,COLUMN_HOST);
    write_list ("file",yesterday,COLUMN_FILE);

    /* Write referers for each downloaded file */
    write_refs ();

    free (download);
    download = NULL;
    download_size = 0;
    download_count = 0;
    free (list);
    free (buffer);
    return (0);
    }

/*----------------------------------------------------------------------*\
\*----------------------------------------------------------------------*/