👨‍💻 dev local search engine

development

fukurou

the supreme coder
ADMIN
pip install beautifulsoup4 requests

crawler

Python:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def crawl(start_url, max_pages=50):
    visited = set()
    to_visit = [start_url]
    index = {}  # url -> text

    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url in visited:
            continue

        try:
            r = requests.get(url, timeout=5)
            soup = BeautifulSoup(r.text, "html.parser")
            text = soup.get_text(" ", strip=True)
            index[url] = text
            visited.add(url)

            # discover new links
            for link in soup.find_all("a", href=True):
                new = urljoin(url, link["href"])
                # keep only same-domain links
                if urlparse(new).netloc == urlparse(start_url).netloc:
                    if new not in visited and new not in to_visit:
                        to_visit.append(new)

        except Exception:
            continue

    return index

Python:
def search(index, query):
    results = []
    q = query.lower()

    for url, text in index.items():
        if q in text.lower():
            results.append(url)

    return results
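
The search above returns matches in whatever order the pages were crawled. A minimal ranked variant, scoring each page by how often the query appears (the search_ranked helper is a sketch, not part of the code above):

Python:
def search_ranked(index, query):
    q = query.lower()
    scored = []

    for url, text in index.items():
        count = text.lower().count(q)  # how many times the query appears on the page
        if count:
            scored.append((count, url))

    # most occurrences first
    scored.sort(reverse=True)
    return [url for count, url in scored]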

main
Python:
if __name__ == "__main__":
    index = crawl("https://example.com", max_pages=30)
    while True:
        q = input("Search: ")
        hits = search(index, q)
        print("\nResults:")
        for h in hits:
            print(" -", h)
        print()
 

fukurou

the supreme coder
ADMIN
Python:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

class LocalSearchEngine:
    def __init__(self, start_url, max_pages=50):
        self.start_url = start_url
        self.max_pages = max_pages
        self.index = {}  # url -> text

    def crawl(self):
        visited = set()
        to_visit = [self.start_url]

        print(f"[Crawler] Starting crawl at: {self.start_url}")

        while to_visit and len(visited) < self.max_pages:
            url = to_visit.pop(0)
            if url in visited:
                continue

            print(f"[Crawler] Fetching: {url}")

            try:
                r = requests.get(url, timeout=5)
                soup = BeautifulSoup(r.text, "html.parser")
                text = soup.get_text(" ", strip=True)
                self.index[url] = text
                visited.add(url)

                # discover new links
                for link in soup.find_all("a", href=True):
                    new = urljoin(url, link["href"])
                    if urlparse(new).netloc == urlparse(self.start_url).netloc:
                        if new not in visited and new not in to_visit:
                            to_visit.append(new)

            except Exception as e:
                print(f"[Crawler] Error fetching {url}: {e}")
                continue

        print(f"[Crawler] Done. Indexed {len(self.index)} pages.\n")

    def search(self, query):
        q = query.lower()
        results = []

        for url, text in self.index.items():
            if q in text.lower():
                results.append(url)

        return results


def main():
    start_url = "https://example.com"  # change this to your domain
    engine = LocalSearchEngine(start_url, max_pages=30)

    engine.crawl()

    while True:
        q = input("Search query: ").strip()
        if not q:
            continue

        hits = engine.search(q)

        print("\nResults:")
        if not hits:
            print("  No matches found.")
        else:
            for h in hits:
                print(" -", h)
        print()


if __name__ == "__main__":
    main()
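
Re-crawling on every run is slow. A minimal sketch of saving the index between runs in a plain JSON file (the save_index/load_index helpers and the index.json filename are assumptions, not part of the class above):

Python:
import json

def save_index(engine, path="index.json"):
    # write the url -> text mapping to disk
    with open(path, "w", encoding="utf-8") as f:
        json.dump(engine.index, f)

def load_index(engine, path="index.json"):
    # restore a previously saved index; returns True if the file existed
    try:
        with open(path, "r", encoding="utf-8") as f:
            engine.index = json.load(f)
        return True
    except FileNotFoundError:
        return False

main() could then call load_index(engine) first and only run engine.crawl() when it returns False.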
 

fukurou

the supreme coder
ADMIN
unique-site finder search engine (visits one page per registered domain, so every result comes from a different site):
Python:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

def get_registered_domain(url):
    """
    Extracts the registered domain:
    pl.wikipedia.org -> wikipedia.org
    en.m.wikipedia.org -> wikipedia.org
    news.bbc.co.uk -> bbc.co.uk
    """
    host = urlparse(url).netloc.lower()
    parts = host.split(".")
    if len(parts) >= 2:
        return ".".join(parts[-2:])  # last two parts
    return host

class DomainCrawler:
    def __init__(self, seeds, max_domains=200):
        self.to_visit = list(seeds)
        self.visited_domains = set()
        self.index = {}
        self.max_domains = max_domains

    def crawl(self):
        print(f"[Crawler] Starting with {len(self.to_visit)} seeds")

        while self.to_visit and len(self.visited_domains) < self.max_domains:
            url = self.to_visit.pop(0)
            domain = get_registered_domain(url)

            if domain in self.visited_domains:
                continue

            print(f"[Crawler] Visiting domain: {domain}  ({url})")
            self.visited_domains.add(domain)

            try:
                r = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/5.0"})
                if "text/html" not in r.headers.get("Content-Type", ""):
                    continue

                soup = BeautifulSoup(r.text, "html.parser")
                text = soup.get_text(" ", strip=True)
                self.index[url] = text

                # discover new domains
                for link in soup.find_all("a", href=True):
                    new = urljoin(url, link["href"]).split("#")[0]
                    if not new.startswith("http"):
                        continue

                    new_domain = get_registered_domain(new)

                    # only follow if domain is new
                    if new_domain not in self.visited_domains:
                        self.to_visit.append(new)

            except Exception:
                continue

            time.sleep(0.2)

        print(f"[Crawler] Done. Found {len(self.visited_domains)} unique domains.\n")

    def search(self, query):
        q = query.lower()
        return [url for url, text in self.index.items() if q in text.lower()]


def main():
    seeds = [
        "https://example.com",
        "https://wikipedia.org",
        "https://news.ycombinator.com"
    ]

    engine = DomainCrawler(seeds, max_domains=200)
    engine.crawl()

    while True:
        q = input("Search: ").strip()
        hits = engine.search(q)
        print("\nResults:")
        for h in hits:
            print(" -", h)
        print()


if __name__ == "__main__":
    main()
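
As the get_registered_domain docstring notes, the two-label heuristic collapses multi-part public suffixes (news.bbc.co.uk becomes co.uk, so every *.co.uk site counts as one domain). A sketch of a drop-in replacement using the third-party tldextract package, which consults the Public Suffix List (assumes pip install tldextract):

Python:
import tldextract

def get_registered_domain(url):
    ext = tldextract.extract(url)
    # registered_domain is "bbc.co.uk" for "https://news.bbc.co.uk"
    return ext.registered_domain or ext.domain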
 

fukurou

the supreme coder
ADMIN
Python:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

def get_registered_domain(url):
    """
    Extracts the registered domain:
    pl.wikipedia.org -> wikipedia.org
    en.m.wikipedia.org -> wikipedia.org
    news.bbc.co.uk -> bbc.co.uk
    """
    host = urlparse(url).netloc.lower()
    parts = host.split(".")
    if len(parts) >= 2:
        return ".".join(parts[-2:])  # last two parts
    return host

class DomainCrawler:
    def __init__(self, seeds, max_domains=200):
        self.to_visit = list(seeds)
        self.visited_domains = set()
        self.index = {}
        self.max_domains = max_domains

    def crawl(self):
        print(f"[Crawler] Starting with {len(self.to_visit)} seeds")

        while self.to_visit and len(self.visited_domains) < self.max_domains:
            url = self.to_visit.pop(0)
            domain = get_registered_domain(url)

            if domain in self.visited_domains:
                continue

            print(f"[Crawler] Visiting domain: {domain}  ({url})")
            self.visited_domains.add(domain)

            try:
                r = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/5.0"})
                if "text/html" not in r.headers.get("Content-Type", ""):
                    continue

                soup = BeautifulSoup(r.text, "html.parser")
                text = soup.get_text(" ", strip=True)
                self.index[url] = text

                # discover new domains
                for link in soup.find_all("a", href=True):
                    new = urljoin(url, link["href"]).split("#")[0]
                    if not new.startswith("http"):
                        continue

                    new_domain = get_registered_domain(new)

                    # only follow if domain is new
                    if new_domain not in self.visited_domains:
                        self.to_visit.append(new)

            except Exception:
                continue

            time.sleep(0.2)

        print(f"[Crawler] Done. Found {len(self.visited_domains)} unique domains.\n")

    def search(self, query):
        q = query.lower()
        return [url for url, text in self.index.items() if q in text.lower()]


def main():
    seeds = [
        "https://example.com",
        "https://wikipedia.org",
        "https://news.ycombinator.com"
    ]

    engine = DomainCrawler(seeds, max_domains=200)
    engine.crawl()

    while True:
        q = input("Search: ").strip()
        if q == "exit":
            break
        hits = engine.search(q)
        print("\nResults:")
        for h in hits:
            print(" -", h)
        print()


if __name__ == "__main__":
    main()
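
Neither crawler checks robots.txt before fetching. A minimal politeness sketch using the standard-library urllib.robotparser (the allowed_by_robots helper is an assumption, not part of the code above); the crawl loop could call it right before requests.get:

Python:
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

def allowed_by_robots(url, user_agent="*"):
    # fetch the site's robots.txt and ask whether this URL may be crawled
    parsed = urlparse(url)
    robots_url = urljoin(f"{parsed.scheme}://{parsed.netloc}", "/robots.txt")
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        return True  # if robots.txt is unreachable, fall back to allowing the fetch
    return rp.can_fetch(user_agent, url)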
 