#!/usr/bin/env python
"""
Sample driver script: launches C{sample_download_crawler.py} (expected to
live in the same directory as this script) against the sample web site
shipped in the C{concurrent_tree_crawler} package.

The crawler is started with a download rate limit and with a daily
schedule computed relative to the current time.
"""

import sys
import os
import os.path
import datetime
import urllib


def set_path_to_package_root():
    """
    Replace the first entry of C{sys.path} with the directory two levels
    above it.

    Calling this function is intended to ensure that:
     1) if this script is placed inside the C{concurrent_tree_crawler}
        package files, importing C{concurrent_tree_crawler} uses these
        package files,
     2) otherwise, when the script is in some other place and we want to
        use the C{concurrent_tree_crawler} library installed in the system,
        importing C{concurrent_tree_crawler} uses that library.
    """
    sys.path[0] = os.path.join(sys.path[0], '../..')


# Must run before C{concurrent_tree_crawler} is imported below.
set_path_to_package_root()


def get_website_address():
    """
    Build the C{file:} URL of the sample web site's start page
    (C{test/data/original_site/issues_1.html}) located inside the
    C{concurrent_tree_crawler} package directory.

    @return: string of the form C{"file:<url-quoted path>"}
    """
    import concurrent_tree_crawler
    try:
        # Python 2 location of the helper (the one this script was
        # written against).
        pathname2url = urllib.pathname2url
    except AttributeError:
        # Python 3 moved the helper to C{urllib.request}.
        from urllib.request import pathname2url
    package_path = os.path.dirname(concurrent_tree_crawler.__file__)
    url = pathname2url(
        "{}/test/data/original_site/issues_1.html".format(package_path))
    return "file:" + url


def _format_clock_time(moment):
    """Return C{moment} as C{hour:minute:second} without zero padding."""
    return "{}:{}:{}".format(moment.hour, moment.minute, moment.second)


website_address = get_website_address()
script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
dst_dir = "./tmp"

now = datetime.datetime.now()
# Interval (start, end) lying 4 to 9 seconds in the future.
sleep_time = (now + datetime.timedelta(seconds=4),
              now + datetime.timedelta(seconds=9))
sleep_time_str = (_format_clock_time(sleep_time[0]),
                  _format_clock_time(sleep_time[1]))
pages_per_second_download_limit = 4

# NOTE: the schedule passed to the crawler starts at the END of the
# sleep interval and ends at its START; judging by the variable names this
# is deliberate, so that the crawler is idle during C{sleep_time}
# -- TODO confirm against the crawler's C{--daily_schedule} semantics.
os.system(
    '{script_dir}/sample_download_crawler.py -v -v '
    '--log_file "{dst_dir}/log.txt" '
    '--max_pages_per_second {download_limit} '
    '--daily_schedule {activity_start}-{activity_end} '
    '"{dst_dir}/state.xml" {website_address} '
    '"{dst_dir}/tmp/download" '.format(
        download_limit=pages_per_second_download_limit,
        script_dir=script_dir,
        dst_dir=dst_dir,
        activity_start=sleep_time_str[1],
        activity_end=sleep_time_str[0],
        website_address=website_address))

# A single parenthesised string prints identically under the Python 2
# print statement and the Python 3 print function.
print("\nNote that if above an information about problems during tree "
      "exploration has been printed, it is expected. "
      "It stems from the fact that some of the pages we want to download "
      "from our testing web site are missing.")