Install the two dependencies first:

pip install beautifulsoup4 requests
crawler

The crawler does a breadth-first walk: it pops URLs off a queue, stores each page's visible text in a dict keyed by URL, and pushes any same-domain links it finds back onto the queue.
Python:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def crawl(start_url, max_pages=50):
    visited = set()
    to_visit = [start_url]  # FIFO queue -> breadth-first order
    index = {}  # url -> extracted page text
    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url in visited:  # the queue may hold duplicates; skip them here
            continue
        visited.add(url)  # mark before fetching so a failing URL is not retried
        try:
            r = requests.get(url, timeout=5)
            soup = BeautifulSoup(r.text, "html.parser")
            text = soup.get_text(" ", strip=True)
            index[url] = text
            # discover new links
            for link in soup.find_all("a", href=True):
                new = urljoin(url, link["href"])
                # keep only same-domain links
                if urlparse(new).netloc == urlparse(start_url).netloc:
                    if new not in visited:
                        to_visit.append(new)
        except Exception:  # skip pages that fail to download or parse
            continue
    return index
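
The crawler above hits pages back to back. For anything beyond a toy run it is worth being polite: identify yourself and pause between requests. A minimal sketch, assuming a fixed one-second delay and an illustrative User-Agent string (both values are my assumptions, not part of the code above):

Python:
import time
import requests

HEADERS = {"User-Agent": "toy-crawler/0.1"}  # illustrative name, set your own
CRAWL_DELAY = 1.0  # seconds between requests; assumed value, tune per site

def polite_get(url):
    # Pause, then fetch with identifying headers; raises on HTTP errors.
    time.sleep(CRAWL_DELAY)
    r = requests.get(url, timeout=5, headers=HEADERS)
    r.raise_for_status()
    return r

Swapping polite_get(url) in for the bare requests.get(url, timeout=5) call inside crawl leaves the rest of the loop unchanged, since the except clause already skips pages that raise.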
Search is then a plain case-insensitive substring scan over the stored text:

Python:
def search(index, query):
    results = []
    q = query.lower()
    for url, text in index.items():
        if q in text.lower():  # case-insensitive substring match
            results.append(url)
    return results
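
A linear scan is fine at 30 pages, but it rereads every stored page on each query. An inverted index (word -> set of URLs) makes lookups roughly independent of page count. A sketch, assuming plain whitespace tokenization is good enough; build_inverted_index and search_all_words are my names, not part of the original:

Python:
from collections import defaultdict

def build_inverted_index(index):
    # Map each lowercase token to the set of URLs containing it.
    inverted = defaultdict(set)
    for url, text in index.items():
        for token in text.lower().split():
            inverted[token].add(url)
    return inverted

def search_all_words(inverted, query):
    # Return URLs containing every word of the query.
    words = query.lower().split()
    if not words:
        return []
    hits = set(inverted.get(words[0], set()))  # copy, so the index is not mutated
    for w in words[1:]:
        hits &= inverted.get(w, set())
    return sorted(hits)

Note the trade: substring matches ("exam" finding "example") no longer work, since matching now happens on whole tokens.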
main

main ties the two together: crawl once up front, then answer queries in a loop.
Python:
if __name__ == "__main__":
    index = crawl("https://example.com", max_pages=30)
    while True:
        q = input("Search: ")
        hits = search(index, q)
        print("\nResults:")
        for h in hits:
            print(" -", h)
        print()
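
As written, the only way out of the loop is Ctrl+C. If you prefer a clean exit, one small variation (my addition, not in the original) is to treat a blank query as quit:

Python:
if __name__ == "__main__":
    index = crawl("https://example.com", max_pages=30)
    while True:
        q = input("Search (blank to quit): ")
        if not q.strip():  # an empty line ends the session
            break
        for h in search(index, q):
            print(" -", h)
        print()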