35 #include <BESCatalogList.h>
36 #include <BESCatalogUtils.h>
37 #include <CatalogItem.h>
39 #include "RemoteResource.h"
40 #include "HttpdCatalogNames.h"
42 #include "HttpdDirScraper.h"
47 #define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
49 namespace httpd_catalog {
/**
 * @brief Constructor: populates the d_months lookup table.
 *
 * Each key is a lower-case, three-letter English month abbreviation and each
 * value is the zero-based month index ("jan" -> 0 ... "dec" -> 11). The
 * zero-based value is what parse_time_format_A() stores into tm.tm_mon.
 */
51 HttpdDirScraper::HttpdDirScraper()
// Keys are stored in lower case only.
// NOTE(review): lookups presumably lower-case the month token before searching - confirm.
54 d_months.insert(pair<string, int>(
string(
"jan"), 0));
55 d_months.insert(pair<string, int>(
string(
"feb"), 1));
56 d_months.insert(pair<string, int>(
string(
"mar"), 2));
57 d_months.insert(pair<string, int>(
string(
"apr"), 3));
58 d_months.insert(pair<string, int>(
string(
"may"), 4));
59 d_months.insert(pair<string, int>(
string(
"jun"), 5));
60 d_months.insert(pair<string, int>(
string(
"jul"), 6));
61 d_months.insert(pair<string, int>(
string(
"aug"), 7));
62 d_months.insert(pair<string, int>(
string(
"sep"), 8));
63 d_months.insert(pair<string, int>(
string(
"oct"), 9));
64 d_months.insert(pair<string, int>(
string(
"nov"), 10));
65 d_months.insert(pair<string, int>(
string(
"dec"), 11));
/**
 * @brief Converts the size text from a directory listing into a long.
 *
 * The last character of size_str is inspected as a possible scale suffix.
 * When that character is alphabetic it is removed and the remaining text is
 * parsed with atol(). The code that derives `scale` from the suffix and
 * applies it to `size` is on lines not shown here.
 *
 * @param size_str The size text, optionally ending in a one-letter suffix.
 * @return Not visible in this excerpt; presumably the scaled size - confirm.
 */
72 long HttpdDirScraper::get_size_val(
const string size_str)
const
// Take the final character as the candidate scale suffix.
// NOTE(review): dereferencing rbegin() of an empty string is undefined behavior;
// confirm callers never pass an empty size_str or add a guard.
74 char scale_c = *size_str.rbegin();
97 BESDEBUG(MODULE, prolog <<
"scale: " << scale << endl);
// Strip the suffix character (only when it is a letter) before parsing the number.
99 string result = size_str;
100 if (isalpha(scale_c)) result = size_str.substr(0, size_str.length() - 1);
// atol() stops at the first character that is not part of an integer, so any
// fractional part in the text (e.g. the ".2" of "1.2") is discarded.
102 long size = atol(result.c_str());
103 BESDEBUG(MODULE, prolog <<
"raw size: " << size << endl);
106 BESDEBUG(MODULE, prolog <<
"scaled size: " << size << endl);
/**
 * @brief Debugging helper that writes every field of a struct tm to the
 * stream `ss`, one "name: value" pair per line.
 *
 * The declaration of `ss` and the return statement are on lines not shown;
 * presumably the accumulated text is returned as a string - confirm.
 *
 * @param tms The broken-down time to dump (taken by value).
 */
113 string show_tm_struct(
const tm tms)
116 ss <<
"tm_sec: " << tms.tm_sec << endl;
117 ss <<
"tm_min: " << tms.tm_min << endl;
118 ss <<
"tm_hour: " << tms.tm_hour << endl;
119 ss <<
"tm_mday: " << tms.tm_mday << endl;
120 ss <<
"tm_mon: " << tms.tm_mon << endl;
121 ss <<
"tm_year: " << tms.tm_year << endl;
122 ss <<
"tm_wday: " << tms.tm_wday << endl;
123 ss <<
"tm_yday: " << tms.tm_yday << endl;
124 ss <<
"tm_isdst: " << tms.tm_isdst << endl;
/**
 * @brief Helper that receives a struct tm by mutable reference.
 *
 * The body is not visible in this excerpt.
 * NOTE(review): going by the name, it presumably resets every field of tms to
 * zero - confirm against the full source.
 *
 * @param tms The struct tm to modify in place.
 */
131 void zero_tm_struct(tm &tms)
/**
 * @brief Parses the "last modified" text from an httpd directory listing.
 *
 * The text is split into tokens on the characters '-', ' ' and ':' (the
 * tokenizing call itself is on a line not shown). The second token is then
 * tested: when every character is alphabetic the input is handled by
 * parse_time_format_A(), otherwise by parse_time_format_B(). How the resulting
 * time_t is turned into the returned string is not visible here.
 *
 * @param httpd_time The timestamp text scraped from the listing.
 * @return Not visible in this excerpt; presumably an ISO-8601 string - confirm.
 */
145 string HttpdDirScraper::httpd_time_to_iso_8601(
const string httpd_time)
const
147 vector<string> tokens;
// Date parts are separated by '-', date from time by ' ', and hours from minutes by ':'.
148 string delimiters =
"- :";
151 BESDEBUG(MODULE, prolog <<
"Found " << tokens.size() <<
" tokens." << endl);
// Debug-only pass that logs every token.
152 vector<string>::iterator it = tokens.begin();
155 while (it != tokens.end()) {
156 BESDEBUG(MODULE, prolog <<
" token["<< i++ <<
"]: "<< *it << endl);
// NOTE(review): tokens[1] is read below with no size check visible in this
// excerpt; confirm fewer than two tokens cannot reach this point.
161 BESDEBUG(MODULE, prolog <<
"Second Field: "<< tokens[1] << endl);
// is_alpha stays true only if every character of the second token is a letter
// (i.e. a month name rather than a month number). The loop stops at the first
// non-letter.
163 const char *second_field = tokens[1].c_str();
164 bool is_alpha =
true;
165 for(
unsigned long i=0; is_alpha && i< tokens[1].length(); i++){
166 is_alpha = isalpha(second_field[i]);
// The branch headers are on lines not shown; presumably Format A is selected
// when is_alpha is true and Format B otherwise.
// NOTE(review): the Format A label below reads "DD-MM-YYY"; it looks like a
// character is missing from the year part - confirm the intended text.
170 BESDEBUG(MODULE, prolog <<
"Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
171 theTime = parse_time_format_A(tokens);
174 BESDEBUG(MODULE, prolog <<
"Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
175 theTime = parse_time_format_B(tokens);
/**
 * @brief Builds a time_t from tokens ordered day, month-name, year[, hour, minute].
 *
 * With more than two tokens the day, month and year fields are filled; with
 * more than four the hour and minute are filled as well. The month comes from
 * a `mnth` name/index pair whose lookup is on a line not shown. The populated
 * struct tm is converted with mktime(), which interprets it as local time.
 *
 * @param tokens The timestamp split into fields (taken by value).
 * @return Not visible in this excerpt; presumably theTime - confirm.
 */
186 time_t HttpdDirScraper::parse_time_format_A(
const vector<string> tokens)
const
// Date part: needs at least day, month and year.
192 if (tokens.size() > 2) {
193 std::istringstream(tokens[0]) >> tm.tm_mday;
194 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
197 BESDEBUG(MODULE, prolog <<
" mnth.first: "<< mnth.first << endl);
198 BESDEBUG(MODULE, prolog <<
" mnth.second: "<< mnth.second << endl);
// mnth.second is the month index used directly as tm_mon.
199 tm.tm_mon = mnth.second;
200 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
// NOTE(review): struct tm expects tm_year as years since 1900; confirm the
// adjustment is made on a line not shown here.
202 std::istringstream(tokens[2]) >> tm.tm_year;
204 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
// Time part: only present when hour and minute tokens exist.
206 if (tokens.size() > 4) {
207 std::istringstream(tokens[3]) >> tm.tm_hour;
208 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
209 std::istringstream(tokens[4]) >> tm.tm_min;
210 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
214 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() normalizes the struct and treats it as local time.
216 time_t theTime = mktime(&tm);
217 BESDEBUG(MODULE, prolog <<
"theTime: " << theTime << endl);
/**
 * @brief Builds a time_t from tokens ordered year, month-number, day[, hour, minute].
 *
 * With more than two tokens the year, month and day fields are read as
 * integers; with more than four the hour and minute are read as well. The
 * populated struct tm is converted with mktime(), which interprets it as
 * local time.
 *
 * @param tokens The timestamp split into fields (taken by value).
 * @return Not visible in this excerpt; presumably theTime - confirm.
 */
226 time_t HttpdDirScraper::parse_time_format_B(
const vector<string> tokens)
const
// Date part: needs at least year, month and day.
232 if (tokens.size() > 2) {
// NOTE(review): struct tm expects tm_year as years since 1900; confirm the
// adjustment is made on a line not shown here.
233 std::istringstream(tokens[0]) >> tm.tm_year;
235 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
// NOTE(review): tm_mon is zero-based (0 = January) while a numeric month in a
// "YYYY-MM-DD" date is normally one-based; confirm a "- 1" adjustment exists
// on a line not shown here.
237 std::istringstream(tokens[1]) >> tm.tm_mon;
238 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
240 std::istringstream(tokens[2]) >> tm.tm_mday;
241 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
// Time part: only present when hour and minute tokens exist.
243 if (tokens.size() > 4) {
244 std::istringstream(tokens[3]) >> tm.tm_hour;
245 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
246 std::istringstream(tokens[4]) >> tm.tm_min;
247 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
251 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() normalizes the struct and treats it as local time.
253 time_t theTime = mktime(&tm);
// The value logged under the "ISO-8601 Time" label is the raw time_t, not a formatted string.
254 BESDEBUG(MODULE, prolog <<
"ISO-8601 Time: " << theTime << endl);
/**
 * @brief Scrapes an httpd directory index page into a map of catalog items.
 *
 * The page is retrieved through `rhr` (declared on a line not shown), read
 * from its cache file into a string, and scanned for "<a " ... "</a>"
 * elements. For each anchor the link text and href are extracted, and the
 * text of the next two "td" elements is taken as the last-modified time and
 * the size. Links that look like images, sort/navigation controls, the parent
 * directory, or remote URLs are skipped. Every remaining link is added to
 * `items` as either a node or a leaf, keyed by name.
 *
 * @param url   The URL of the directory page.
 * @param items Output map from item name to CatalogItem pointer; entries are
 *              inserted here. The allocation of those items is on lines not
 *              shown - ownership should be confirmed against the callers.
 */
274 void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items)
const
// Fetch the remote page, then read it back from the local cache file.
280 rhr.retrieveResource();
283 ifstream cache_file_is(rhr.getCacheFileName().c_str());
284 if(!cache_file_is.is_open()){
285 string msg = prolog +
"ERROR - Failed to open cache file: " + rhr.getCacheFileName();
286 BESDEBUG(MODULE, msg << endl);
// Slurp the whole file into a single string for searching.
290 buffer << cache_file_is.rdbuf();
291 string pageStr = buffer.str();
292 BESDEBUG(MODULE, prolog <<
"Page Content: " << endl << pageStr << endl);
// A page without this title marker is not treated as an httpd index page.
// What happens after the debug message is on lines not shown.
295 if(pageStr.find(
"<title>Index of ") == string::npos){
297 BESDEBUG(MODULE, prolog <<
"The url: " << url <<
" does not appear to reference an Apache httpd Index page." << endl);
// Markers used while scanning the HTML text.
301 string aOpenStr =
"<a ";
302 string aCloseStr =
"</a>";
303 string hrefStr =
"href=\"";
304 string tdOpenStr =
"<td ";
305 string tdCloseStr =
"</td>";
// hrefs that are fragments ("#..."), column-sort queries ("?C..."), contain
// "redirect/", are exactly "/", or start with "<img" are excluded.
307 BESRegex hrefExcludeRegex(
"(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
308 BESRegex nameExcludeRegex(
"^Parent Directory$");
// Locate the next anchor element. The result of find() is stored in an int,
// so a "not found" result is detected as a negative value.
313 int aOpenIndex = pageStr.find(aOpenStr, next_start);
314 if (aOpenIndex < 0) {
318 int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.length());
319 if (aCloseIndex < 0) {
326 BESDEBUG(MODULE, prolog <<
"aOpenIndex: " << aOpenIndex << endl);
327 BESDEBUG(MODULE, prolog <<
"aCloseIndex: " << aCloseIndex << endl);
// Cut out the complete anchor element, including its closing tag.
328 length = aCloseIndex + aCloseStr.length() - aOpenIndex;
329 string aElemStr = pageStr.substr(aOpenIndex, length);
330 BESDEBUG(MODULE, prolog <<
"Processing link: " << aElemStr << endl);
// Link text: everything between the first '>' and the next '<'.
333 int start = aElemStr.find(
">") + 1;
334 int end = aElemStr.find(
"<", start);
335 length = end - start;
336 string linkText = aElemStr.substr(start, length);
337 BESDEBUG(MODULE, prolog <<
"Link Text: " << linkText << endl);
// href value: everything between href=" and the next double quote.
// NOTE(review): if the anchor has no href attribute, find() returns npos and
// the addition below wraps around; confirm such anchors cannot occur or are
// handled elsewhere.
340 start = aElemStr.find(hrefStr) + hrefStr.length();
341 end = aElemStr.find(
"\"", start);
342 length = end - start;
343 string href = aElemStr.substr(start, length);
344 BESDEBUG(MODULE, prolog <<
"href: " << href << endl);
// The first "td" element after the anchor supplies the time text...
348 int start_pos = getNextElementText(pageStr,
"td", aCloseIndex + aCloseStr.length(), time_str);
349 BESDEBUG(MODULE, prolog <<
"time_str: '" << time_str <<
"'" << endl);
// ...and the "td" element after that supplies the size text.
353 start_pos = getNextElementText(pageStr,
"td", start_pos, size_str);
354 BESDEBUG(MODULE, prolog <<
"size_str: '" << size_str <<
"'" << endl);
// Skip links whose text is an image tag, is empty, or contains "<<<" / ">>>".
356 if ((linkText.find(
"<img") != string::npos) || !(linkText.length()) || (linkText.find(
"<<<") != string::npos)
357 || (linkText.find(
">>>") != string::npos)) {
358 BESDEBUG(MODULE, prolog <<
"SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
// Skip empty hrefs and absolute http/https hrefs that do not start with `url`.
361 if (href.length() == 0 || (((href.find(
"http://") == 0) || (href.find(
"https://") == 0)) && !(href.find(url) == 0))) {
363 BESDEBUG(MODULE, prolog <<
"SKIPPING(null or remote): " << href << endl);
365 else if (hrefExcludeRegex.match(href.c_str(), href.length(), 0) > 0) {
367 BESDEBUG(MODULE, prolog <<
"SKIPPING(hrefExcludeRegex) - href: '" << href <<
"'"<< endl);
369 else if (nameExcludeRegex.match(linkText.c_str(), linkText.length(), 0) > 0) {
371 BESDEBUG(MODULE, prolog <<
"SKIPPING(nameExcludeRegex) - name: '" << linkText <<
"'" << endl);
// Node (directory) case: the name is the href without its last character.
// The condition selecting this branch is on a line not shown; presumably the
// href ends in '/' - confirm.
374 string node_name = href.substr(0, href.length() - 1);
376 BESDEBUG(MODULE, prolog <<
"NODE: " << node_name << endl);
378 childNode->
set_type(CatalogItem::node);
381 string iso_8601_time = httpd_time_to_iso_8601(time_str);
382 childNode->
set_lmt(iso_8601_time);
384 long size = get_size_val(size_str);
387 items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
// Leaf (file) case: the item is keyed by the full href.
391 BESDEBUG(MODULE, prolog <<
"LEAF: " << href << endl);
393 leafItem->
set_type(CatalogItem::leaf);
396 string iso_8601_time = httpd_time_to_iso_8601(time_str);
397 leafItem->
set_lmt(iso_8601_time);
398 long size = get_size_val(size_str);
401 items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
// Resume the scan immediately after this anchor's closing tag.
405 next_start = aCloseIndex + aCloseStr.length();
/**
 * @brief Finds the next occurrence of an element in page_str and extracts its text.
 *
 * The search begins at startIndex and looks for "<" + element_name + " "
 * followed by "</" + element_name + ">". Because the opening pattern ends in
 * a space, only opening tags that carry at least one attribute are matched.
 * The text between the opening tag's '>' and the next '<' is stored in
 * resultText.
 *
 * @param page_str     The page text to search.
 * @param element_name The element name without angle brackets, e.g. "td".
 * @param startIndex   Offset in page_str at which the search starts.
 * @param resultText   Output: the text content of the element.
 * @param trim         Not referenced on any visible line; presumably controls
 *                     whitespace trimming of resultText - confirm.
 * @return startIndex plus the length of the matched element text.
 */
422 int HttpdDirScraper::getNextElementText(
const string &page_str,
const string element_name,
int startIndex,
string &resultText,
bool trim)
const
424 string e_open_str =
"<" + element_name +
" ";
425 string e_close_str =
"</" + element_name +
">";
// find() results are stored in int, so "not found" shows up as a negative value.
428 int start = page_str.find(e_open_str, startIndex);
429 int end = page_str.find(e_close_str, start + e_open_str.length());
// Failure case; the handling is on lines not shown.
430 if(start<0 || end<0 || end<start){
// Cut out the whole element, from its opening tag through its closing tag.
435 int length = end + e_close_str.length() - start;
436 string element_str = page_str.substr(start, length);
// start/end are reused here as offsets inside element_str, not page_str.
439 start = element_str.find(
">") + 1;
440 end = element_str.find(
"<", start);
441 length = end - start;
442 resultText = element_str.substr(start, length);
446 BESDEBUG(MODULE, prolog <<
"resultText: '" << resultText <<
"'" << endl);
// NOTE(review): the returned offset is measured from startIndex, not from where
// the element was actually found, so any text between startIndex and the
// element's opening tag is not accounted for. Confirm this is intended, since
// callers feed the value back in as the next startIndex.
447 return startIndex + element_str.length();
/**
 * @brief Builds a CatalogNode for the resource at url.
 *
 * In the part shown first, the directory page at url is scraped into a
 * name -> CatalogItem map and every item is attached to `node` as a child
 * node or a leaf according to its type. In the part shown last, the final
 * '/'-separated component of url is taken as a leaf name and a single item is
 * attached with set_leaf(). The condition choosing between the two parts, the
 * creation of `node` and `item`, and the return statement are on lines not
 * shown.
 *
 * @param url  The URL to process.
 * @param path Not referenced on any visible line.
 * @return Not visible in this excerpt; presumably `node` - confirm.
 */
457 bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
459 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
// Scrape the directory page into a map keyed by item name.
464 map<string, bes::CatalogItem *> items;
465 createHttpdDirectoryPageMap(url, items);
467 BESDEBUG(MODULE, prolog <<
"Found " << items.size() <<
" items." << endl);
// `it` is declared without an initializer; its assignment is presumably on a
// line not shown - confirm.
468 map<string, bes::CatalogItem *>::iterator it;
470 while (it != items.end()) {
472 BESDEBUG(MODULE, prolog <<
"Adding item: '" << item->
get_name() <<
"'"<< endl);
// Attach the item according to its declared type.
473 if (item->
get_type() == CatalogItem::node)
474 node->add_node(item);
476 node->add_leaf(item);
// Single-item case: the last non-empty path component of the URL is the leaf name.
// NOTE(review): back() on an empty vector is undefined behavior; confirm url
// always yields at least one component.
483 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
484 string leaf_name = url_parts.back();
497 node->set_leaf(item);
/**
 * @brief Variant of get_node() that works from two sets of names rather than
 * a map of items.
 *
 * It has the same signature as the other get_node() definition in this file;
 * presumably one of the two is disabled by preprocessor lines not shown -
 * confirm. It calls a three-argument createHttpdDirectoryPageMap(url, nodes,
 * leaves) whose definition is not part of this excerpt. Each node name has a
 * trailing "/" removed and a node-typed item is added to `node`; each leaf
 * name yields a leaf-typed item added to `node`. The final part takes the
 * last '/'-separated component of url as a leaf name and attaches a single
 * item with set_leaf(). The creation of `node`, `childNode`, `leafItem` and
 * `item`, the loop increments and the return statement are on lines not shown.
 *
 * @param url  The URL to process.
 * @param path Not referenced on any visible line.
 * @return Not visible in this excerpt; presumably `node` - confirm.
 */
504 bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
506 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
// Collect directory (node) names and file (leaf) names separately.
511 set<string> pageNodes;
512 set<string> pageLeaves;
513 createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);
515 BESDEBUG(MODULE, prolog <<
"Found " << pageNodes.size() <<
" nodes." << endl);
516 BESDEBUG(MODULE, prolog <<
"Found " << pageLeaves.size() <<
" leaves." << endl);
518 set<string>::iterator it;
// First pass: child nodes.
520 it = pageNodes.begin();
521 while (it != pageNodes.end()) {
522 string pageNode = *it;
// Drop a trailing slash so the node name does not end in '/'.
523 if (
BESUtil::endsWith(pageNode,
"/")) pageNode = pageNode.substr(0, pageNode.length() - 1);
526 childNode->
set_type(CatalogItem::node);
537 node->add_node(childNode);
// Second pass: leaves.
541 it = pageLeaves.begin();
542 while (it != pageLeaves.end()) {
545 leafItem->
set_type(CatalogItem::leaf);
557 node->add_leaf(leafItem);
// Single-item case: the last non-empty path component of the URL is the leaf name.
// NOTE(review): back() on an empty vector is undefined behavior; confirm url
// always yields at least one component.
562 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
563 string leaf_name = url_parts.back();
574 node->set_leaf(item);
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
bool is_data(const std::string &item) const
Is there a handler that can process this item?
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
exception thrown if internal error encountered
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the return vector of tokens using the delimiter delim and skipping empty values.
static bool endsWith(std::string const &fullString, std::string const &ending)
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
static std::string lowercase(const std::string &s)
static void removeLeadingAndTrailingBlanks(std::string &key)
static std::string get_time(bool use_local_time=false)
void set_name(std::string n)
Set the name of the item.
std::string get_name() const
The name of this item in the node.
void set_size(size_t s)
Set the size of the item.
void set_is_data(bool id)
Is this item data that the BES should interpret?
void set_lmt(std::string lmt)
Set the LMT for this item.
item_type get_type() const
Get the type of this item (unknown, node or leaf)
void set_type(item_type t)
Set the type for this item.