Python:
import requests
from bs4 import BeautifulSoup
from collections import Counter
from typing import List, Tuple
from urllib.parse import urljoin, urlparse

MAINSTREAM_SITES = ['google.com', 'bing.com', 'facebook.com', 'youtube.com']

def crawl_web(url: str) -> List[str]:
    """Fetch a page and return the absolute URLs of all links on it."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise on 4xx/5xx HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')
        # Resolve relative hrefs against the page URL so every link is absolute
        return [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
    except requests.RequestException as e:
        print(f"Error crawling {url}: {e}")
        return []

def filter_mainstream_sites(links: List[str]) -> List[str]:
    """Drop links whose hostname belongs to a mainstream site."""
    def is_mainstream(link: str) -> bool:
        # Match on the hostname, not a substring, so e.g.
        # 'mygoogle.example.net' is not filtered by mistake
        host = urlparse(link).netloc.lower()
        return any(host == site or host.endswith('.' + site)
                   for site in MAINSTREAM_SITES)
    return [link for link in links if not is_mainstream(link)]

def rank_sites(links: List[str]) -> List[Tuple[str, int]]:
    """Rank links by how often they appear, most frequent first."""
    return Counter(links).most_common()

def search_engine(*seed_sites: str) -> List[Tuple[str, int]]:
    """Crawl the seed sites, drop mainstream links, and rank the rest."""
    all_links: List[str] = []
    for seed_site in seed_sites:
        links = crawl_web(seed_site)
        all_links.extend(filter_mainstream_sites(links))
    return rank_sites(all_links)[:10]  # Return top 10 non-mainstream links

# Usage example:
seed_sites = ['http://example1.com', 'http://example2.com']
results = search_engine(*seed_sites)
for site, count in results:
    print(f'{site} (count: {count})')
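
One thing to be aware of: rank_sites counts full URLs, so two different pages on the same site are tallied separately and most counts will be 1. If the intent is to surface sites rather than individual pages, the links can be collapsed to their hostname before counting. Below is a minimal sketch of that variant; rank_by_domain is a hypothetical helper, not part of the original code.

Python:
from collections import Counter
from typing import List, Tuple
from urllib.parse import urlparse

def rank_by_domain(links: List[str]) -> List[Tuple[str, int]]:
    # Hypothetical variant of rank_sites: normalize each link to its
    # hostname so pages from the same site are counted together
    domains = [urlparse(link).netloc.lower()
               for link in links if urlparse(link).netloc]
    return Counter(domains).most_common()

# Example: rank_by_domain(['http://a.com/x', 'http://a.com/y', 'http://b.com/z'])
# returns [('a.com', 2), ('b.com', 1)]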