diff --git a/src/uwhoisd/scraper.py b/src/uwhoisd/scraper.py
index 5135fa4..6da6e34 100644
--- a/src/uwhoisd/scraper.py
+++ b/src/uwhoisd/scraper.py
@@ -40,7 +40,7 @@ def fetch_ipv4_assignments(url: str):
         yield prefix, whois
 
 
-def fetch(session: requests.Session, url: str):
+def fetch(session: requests.Session, url: str) -> BeautifulSoup:
     """
     Fetch a URL and parse it with Beautiful Soup for scraping.
     """
@@ -55,7 +55,7 @@ def munge_zone(zone: str) -> str:
     return zone.strip("\u200e\u200f.").encode("idna").decode().lower()
 
 
-def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]):
+def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]) -> t.Iterator[t.Tuple[str, str]]:
     """
     Scrape IANA's root zone database for WHOIS servers.
     """
@@ -64,46 +64,65 @@ def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str])
     logging.info("Scraping %s", root_zone_db_url)
     body = fetch(session, root_zone_db_url)
 
-    for link in body.select("#tld-table .tld a"):
-        if "href" not in link.attrs or link.string is None:
-            continue
-
-        zone = munge_zone(link.string)
+    for zone, zone_url in extract_zone_urls(root_zone_db_url, body):
         # If we've already scraped this TLD, ignore it.
         if zone in existing:
             yield (zone, existing[zone])
             continue
 
-        # Is this a zone we should skip/ignore?
-        row = link.parent.parent.parent.findChildren("td")
-        if row[1].string == "test":
-            continue
-        if row[2].string in ("Not assigned", "Retired"):
-            continue
-
-        zone_url = urljoin(root_zone_db_url, link.attrs["href"])
         logging.info("Scraping %s", zone_url)
         body = fetch(session, zone_url)
-
-        whois_server_label = body.find("b", string="WHOIS Server:")
-        whois_server = ""
-        if whois_server_label is not None:
-            whois_server = whois_server_label.next_sibling.strip().lower()
-
+        whois_server = extract_whois_server(body)
         # Fallback to trying whois.nic.*
-        if whois_server == "":
+        if whois_server is None:
             whois_server = f"whois.nic.{zone}"
             logging.info("Trying fallback server: %s", whois_server)
             try:
                 socket.gethostbyname(whois_server)
             except socket.gaierror:
-                whois_server = ""
+                logging.info("No WHOIS server found for %s", zone)
+                continue
+
+        logging.info("WHOIS server for %s is %s", zone, whois_server)
+        yield (zone, whois_server)
+
+
+def extract_zone_urls(base_url: str, body: BeautifulSoup) -> t.Iterator[t.Tuple[str, str]]:
+    """
+    Extract zones and the URLs of their detail pages from the root zone table.
+
+    Yields `(zone, url)` pairs, skipping rows whose type cell is "test" and
+    rows whose manager cell is "Not assigned" or "Retired".
+    """
+    for link in body.select("#tld-table .tld a"):
+        if "href" not in link.attrs or link.string is None:  # pragma: no cover
+            continue
+        row = link.find_parent("tr")
+        if row is None:  # pragma: no cover
+            continue
+        tds = row.find_all("td")
+        # The type and manager cells are read by index below; skip a
+        # malformed row instead of aborting the scrape with an IndexError.
+        if len(tds) < 3:  # pragma: no cover
+            continue
+        # Is this a zone we should skip/ignore?
+        if tds[1].string == "test":
+            continue
+        if tds[2].string in ("Not assigned", "Retired"):
+            continue
+
+        yield (munge_zone(link.string), urljoin(base_url, link.attrs["href"]))
+
 
-        if whois_server == "":
-            logging.info("No WHOIS server found for %s", zone)
-        else:
-            logging.info("WHOIS server for %s is %s", zone, whois_server)
-            yield (zone, whois_server)
+def extract_whois_server(body: BeautifulSoup) -> t.Optional[str]:
+    """
+    Extract the WHOIS server named after the "WHOIS Server:" label, or `None` if there is none.
+    """
+    whois_server_label = body.find("b", string="WHOIS Server:")
+    if whois_server_label is None or whois_server_label.next_sibling is None:
+        return None
+    server = whois_server_label.next_sibling.text.strip().lower()
+    return server or None
 
 
 def make_arg_parser() -> argparse.ArgumentParser:
diff --git a/tests/iana-root-zone.html b/tests/iana-root-zone.html
new file mode 100644
index 0000000..974b556
--- /dev/null
+++ b/tests/iana-root-zone.html
@@ -0,0 +1,69 @@
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DomainTypeTLD Manager
.brokenbrokenThis is just here for coverage
.aaagenericAmerican Automobile Association, + Inc.
.btcountry-codeMinistry of Information and + Communications
.dummygenericNot assigned
.sillygenericRetired
.테스트testNot assigned
.xxxsponsoredICM Registry LLC
+
+
+
+
+
+
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
new file mode 100644
index 0000000..e46cb9f
--- /dev/null
+++ b/tests/test_scraper.py
@@ -0,0 +1,46 @@
+from os import path
+
+import bs4
+import pytest
+
+from uwhoisd import scraper
+
+HERE = path.dirname(__file__)
+
+
+def test_extract_zone_urls():
+    with open(path.join(HERE, "iana-root-zone.html"), encoding="utf-8") as fh:
+        body = bs4.BeautifulSoup(fh, "html.parser")
+    result = list(scraper.extract_zone_urls("http://example.com", body))
+    # The test, unassigned and retired zones should not appear
+    assert result == [
+        ("aaa", "http://example.com/domains/root/db/aaa.html"),
+        ("bt", "http://example.com/domains/root/db/bt.html"),
+        ("xxx", "http://example.com/domains/root/db/xxx.html"),
+    ]
+
+
+def test_extract_zone_urls_edge_cases():
+    empty_body = bs4.BeautifulSoup("", "html.parser")
+    assert list(scraper.extract_zone_urls("http://example.com", empty_body)) == []
+
+
+def test_extract_whois_server():
+    with open(path.join(HERE, "zone-info-fragment.html"), encoding="utf-8") as fh:
+        body = bs4.BeautifulSoup(fh, "html.parser")
+    result = scraper.extract_whois_server(body)
+    assert result == "whois.nic.abc"
+
+
+@pytest.mark.parametrize(
+    "fragment",
+    [
+        "",
+        "<b>WHOIS Server:</b> ",
+        "<b>WHOIS Server:</b>",
+    ],
+)
+def test_extract_whois_server_no_matches(fragment):
+    body = bs4.BeautifulSoup(fragment, "html.parser")
+    result = scraper.extract_whois_server(body)
+    assert result is None
diff --git a/tests/zone-info-fragment.html b/tests/zone-info-fragment.html
new file mode 100644
index 0000000..90831b4
--- /dev/null
+++ b/tests/zone-info-fragment.html
@@ -0,0 +1,11 @@
+
+
+
+

Registry Information

+

+ URL for registration services: http://abc.com
+ WHOIS Server: whois.nic.abc +

+ +