Python:
import requests
from bs4 import BeautifulSoup
from queue import Queue
from typing import List, Set, Tuple, Optional
from urllib.parse import urljoin


def fetch_page(url: str) -> Optional[str]:
    """Fetch a page and return its HTML, or None on any failure."""
    try:
        # A timeout keeps the crawler from hanging on a slow host.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


def extract_links(html: str, base_url: str) -> Set[str]:
    """Collect absolute http(s) links found on a page."""
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for tag in soup.find_all('a', href=True):
        # urljoin resolves any relative href (not just those starting
        # with '/') against the page it was found on.
        link = urljoin(base_url, tag['href'])
        if link.startswith('http'):
            links.add(link)
    return links


def keyword_found(html: str, keywords: List[str]) -> bool:
    """True if any keyword appears anywhere in the raw HTML."""
    return any(keyword in html for keyword in keywords)


def negative_keyword_found(html: str, negative_keywords: List[str]) -> bool:
    """True if any negative keyword appears anywhere in the raw HTML."""
    return any(keyword in html for keyword in negative_keywords)
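

# Optional variant (not part of the original crawler): the matchers above
# scan the raw HTML, so a keyword can also match inside tags or attributes.
# To match only the visible text, a sketch using BeautifulSoup's get_text()
# might look like this:
def keyword_in_text(html: str, keywords: List[str]) -> bool:
    text = BeautifulSoup(html, 'html.parser').get_text()
    return any(keyword in text for keyword in keywords)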


def web_crawler(seed_sites: List[str], keywords: List[str],
                negative_keywords: List[str], max_depth: int = 2) -> None:
    """Breadth-first crawl from the seed sites, printing keyword matches."""
    visited: Set[str] = set()
    queue: Queue[Tuple[str, int]] = Queue()
    for seed in seed_sites:
        queue.put((seed, 0))
    while not queue.empty():
        current_url, depth = queue.get()
        if depth > max_depth or current_url in visited:
            continue
        html = fetch_page(current_url)
        if html:
            if keyword_found(html, keywords) and not negative_keyword_found(html, negative_keywords):
                print(f"Found a match: {current_url}")
            for link in extract_links(html, current_url):
                if link not in visited:
                    queue.put((link, depth + 1))
        # Mark the URL visited even when the fetch failed, so it is not retried.
        visited.add(current_url)


# Example usage:
if __name__ == '__main__':
    seed_sites = ['https://example.com']
    keywords = ['keyword1', 'keyword2']
    negative_keywords = ['negative1', 'negative2']
    web_crawler(seed_sites, keywords, negative_keywords)
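
The traversal is breadth-first: pages are processed in FIFO order, every extracted link is enqueued at depth + 1, and anything beyond max_depth is skipped, so the crawl stops expanding past that depth. URLs are only marked visited after they are processed, but the visited check at dequeue time still prevents re-fetching duplicates that were queued more than once.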