requests_spider.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. import os
  2. from bs4 import BeautifulSoup
  3. import json
  4. import sys
  5. import time
  6. import requests
  7. from datetime import datetime
  8. def crawl_url(url):
  9. """
  10. Crawls the given URL and returns the response time and any internal links found.
  11. """
  12. start_time = datetime.now()
  13. response = requests.get(url)
  14. soup = None
  15. if response.status_code == 200:
  16. soup = BeautifulSoup(response.text, 'html.parser')
  17. internal_links = []
  18. if soup:
  19. for link in soup.find_all('a'):
  20. href = link.get('href')
  21. if href and not href.startswith('http'):
  22. internal_links.append(href)
  23. response_time = (datetime.now() - start_time).total_seconds() * 1000
  24. return response_time, internal_links
  25. def main():
  26. script_dir = os.environ.get('REQUESTS_SPIDER_DIR', '/requests_spider/')
  27. interval = int(os.environ.get('REQUEST_SPIDER_INTERVAL', '10'))
  28. forks = int(os.environ.get('REQUEST_SPIDER_FORKS', '1'))
  29. while True:
  30. urls_file = os.path.join(script_dir, 'urls.json')
  31. if not os.path.isfile(urls_file):
  32. print(f"Error: {urls_file} not found.")
  33. sys.exit(1)
  34. with open(urls_file) as f:
  35. urls = json.load(f)
  36. runs = []
  37. for url in urls:
  38. for i in range(1, forks + 1):
  39. time.sleep(0.5) # add a delay to be polite
  40. response_time, internal_links = crawl_url(url)
  41. runs.append({
  42. "url": url,
  43. "fork": i,
  44. "start_time": datetime.now().timestamp(),
  45. "end_time": None,
  46. "exit_status": None,
  47. "response_time": response_time,
  48. "internal_links": internal_links,
  49. })
  50. time.sleep(interval)
  51. for run in runs:
  52. if run["end_time"] is not None:
  53. continue
  54. run["end_time"] = datetime.now().timestamp()
  55. # Print the results
  56. for run in runs:
  57. print(f"URL: {run['url']}\tFork: {run['fork']}\tStart Time: {datetime.fromtimestamp(run['start_time'])}\tEnd Time: {datetime.fromtimestamp(run['end_time'])}\tResponse Time: {run['response_time']}ms\tInternal Links: {run['internal_links']}")
  58. if __name__ == '__main__':
  59. main()