123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- import os
- from bs4 import BeautifulSoup
- import json
- import sys
- import time
- import requests
- from datetime import datetime
def crawl_url(url, timeout=10):
    """Fetch *url* and report the request duration and its internal links.

    Args:
        url: Address passed unchanged to ``requests.get``.
        timeout: Value handed to ``requests.get`` as its ``timeout``
            argument, so an unresponsive server cannot block the caller
            indefinitely.

    Returns:
        A ``(response_time, internal_links)`` tuple. ``response_time`` is
        the duration of the HTTP request in milliseconds. ``internal_links``
        is a list of the raw ``href`` values of ``<a>`` tags whose value does
        not start with ``"http"``; it is empty when the status code is not
        200.

    Raises:
        requests.RequestException: Propagated unchanged from
            ``requests.get`` (for example on connection errors or timeouts).
    """
    # Time only the request itself, on a monotonic clock, so that HTML
    # parsing and wall-clock adjustments do not distort the measurement.
    started = time.perf_counter()
    response = requests.get(url, timeout=timeout)
    response_time = (time.perf_counter() - started) * 1000

    # Only successful responses are parsed for links.
    if response.status_code != 200:
        return response_time, []

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): every href that does not start with "http" is treated as
    # internal, which also admits values such as "mailto:...", "#fragment"
    # and "//other-host/path" -- confirm this heuristic is intended.
    internal_links = [
        href
        for href in (anchor.get('href') for anchor in soup.find_all('a'))
        if href and not href.startswith('http')
    ]
    return response_time, internal_links
def main():
    """Crawl every URL listed in ``urls.json`` in an endless loop.

    Configuration is read from environment variables:

    * ``REQUESTS_SPIDER_DIR`` -- directory containing ``urls.json``
      (default ``/requests_spider/``).
    * ``REQUEST_SPIDER_INTERVAL`` -- seconds to sleep between cycles
      (default ``10``).
    * ``REQUEST_SPIDER_FORKS`` -- how many times each URL is crawled per
      cycle (default ``1``). The repetitions run sequentially in this
      process; no child processes are created.

    NOTE(review): the first variable is spelled ``REQUESTS_...`` while the
    other two are spelled ``REQUEST_...``; the names are kept as they are
    because deployments may already depend on them.

    The URL file is re-read on every cycle. If it is missing, an error is
    printed and the process exits with status 1. Exceptions raised by
    ``crawl_url`` are not caught here and terminate the loop.
    """
    script_dir = os.environ.get('REQUESTS_SPIDER_DIR', '/requests_spider/')
    interval = int(os.environ.get('REQUEST_SPIDER_INTERVAL', '10'))
    forks = int(os.environ.get('REQUEST_SPIDER_FORKS', '1'))
    # The path does not change between cycles, so build it once.
    urls_file = os.path.join(script_dir, 'urls.json')

    while True:
        if not os.path.isfile(urls_file):
            print(f"Error: {urls_file} not found.")
            sys.exit(1)
        with open(urls_file) as f:
            urls = json.load(f)

        runs = []
        for url in urls:
            for fork in range(1, forks + 1):
                time.sleep(0.5)  # add a delay to be polite
                # Stamp the run immediately before and after the crawl so
                # that start_time/end_time bracket the actual request.
                start_time = datetime.now().timestamp()
                response_time, internal_links = crawl_url(url)
                end_time = datetime.now().timestamp()
                runs.append({
                    "url": url,
                    "fork": fork,
                    "start_time": start_time,
                    "end_time": end_time,
                    "exit_status": None,
                    "response_time": response_time,
                    "internal_links": internal_links,
                })

        # Print the results as soon as the cycle is complete, then wait
        # for the configured interval before starting the next cycle.
        for run in runs:
            print(f"URL: {run['url']}\tFork: {run['fork']}\tStart Time: {datetime.fromtimestamp(run['start_time'])}\tEnd Time: {datetime.fromtimestamp(run['end_time'])}\tResponse Time: {run['response_time']}ms\tInternal Links: {run['internal_links']}")

        time.sleep(interval)
# Start the crawl loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|