diff --git a/src/uwhoisd/scraper.py b/src/uwhoisd/scraper.py
index 5135fa4..6da6e34 100644
--- a/src/uwhoisd/scraper.py
+++ b/src/uwhoisd/scraper.py
@@ -40,7 +40,7 @@ def fetch_ipv4_assignments(url: str):
yield prefix, whois
-def fetch(session: requests.Session, url: str):
+def fetch(session: requests.Session, url: str) -> BeautifulSoup:
"""
Fetch a URL and parse it with Beautiful Soup for scraping.
"""
@@ -55,7 +55,7 @@ def munge_zone(zone: str) -> str:
return zone.strip("\u200e\u200f.").encode("idna").decode().lower()
-def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]):
+def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]) -> t.Iterator[t.Tuple[str, str]]:
"""
Scrape IANA's root zone database for WHOIS servers.
"""
@@ -64,46 +64,52 @@ def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str])
logging.info("Scraping %s", root_zone_db_url)
body = fetch(session, root_zone_db_url)
- for link in body.select("#tld-table .tld a"):
- if "href" not in link.attrs or link.string is None:
- continue
-
- zone = munge_zone(link.string)
+ for zone, zone_url in extract_zone_urls(root_zone_db_url, body):
# If we've already scraped this TLD, ignore it.
if zone in existing:
yield (zone, existing[zone])
continue
- # Is this a zone we should skip/ignore?
- row = link.parent.parent.parent.findChildren("td")
- if row[1].string == "test":
- continue
- if row[2].string in ("Not assigned", "Retired"):
- continue
-
- zone_url = urljoin(root_zone_db_url, link.attrs["href"])
logging.info("Scraping %s", zone_url)
body = fetch(session, zone_url)
-
- whois_server_label = body.find("b", string="WHOIS Server:")
- whois_server = ""
- if whois_server_label is not None:
- whois_server = whois_server_label.next_sibling.strip().lower()
-
+ whois_server = extract_whois_server(body)
# Fallback to trying whois.nic.*
- if whois_server == "":
+ if whois_server is None:
whois_server = f"whois.nic.{zone}"
logging.info("Trying fallback server: %s", whois_server)
try:
socket.gethostbyname(whois_server)
except socket.gaierror:
- whois_server = ""
+ logging.info("No WHOIS server found for %s", zone)
+ continue
+
+ logging.info("WHOIS server for %s is %s", zone, whois_server)
+ yield (zone, whois_server)
+
+
+def extract_zone_urls(base_url: str, body: BeautifulSoup) -> t.Iterator[t.Tuple[str, str]]:
+ for link in body.select("#tld-table .tld a"):
+ if "href" not in link.attrs or link.string is None: # pragma: no cover
+ continue
+ row = link.find_parent("tr")
+ if row is None: # pragma: no cover
+ continue
+ tds = row.find_all("td")
+ # Is this a zone we should skip/ignore?
+ if tds[1].string == "test":
+ continue
+ if tds[2].string in ("Not assigned", "Retired"):
+ continue
+
+ yield (munge_zone(link.string), urljoin(base_url, link.attrs["href"]))
+
- if whois_server == "":
- logging.info("No WHOIS server found for %s", zone)
- else:
- logging.info("WHOIS server for %s is %s", zone, whois_server)
- yield (zone, whois_server)
+def extract_whois_server(body: BeautifulSoup) -> t.Optional[str]:
+ whois_server_label = body.find("b", string="WHOIS Server:")
+ if whois_server_label is None or whois_server_label.next_sibling is None:
+ return None
+ server = whois_server_label.next_sibling.text.strip().lower()
+ return None if server == "" else server
def make_arg_parser() -> argparse.ArgumentParser:
diff --git a/tests/iana-root-zone.html b/tests/iana-root-zone.html
new file mode 100644
index 0000000..974b556
--- /dev/null
+++ b/tests/iana-root-zone.html
@@ -0,0 +1,69 @@
+
+
+
+
+
+
+
+
+
+
+ Domain |
+ Type |
+ TLD Manager |
+
+
+
+
+ .broken |
+ broken |
+ This is just here for coverage |
+
+
+ .aaa |
+ generic |
+ American Automobile Association,
+ Inc. |
+
+
+ .bt |
+ country-code |
+ Ministry of Information and
+ Communications |
+
+
+ .dummy |
+ generic |
+ Not assigned |
+
+
+ .silly |
+ generic |
+ Retired |
+
+
+
+ .테스트 |
+ test |
+ Not assigned |
+
+
+ .xxx |
+ sponsored |
+ ICM Registry LLC |
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
new file mode 100644
index 0000000..e46cb9f
--- /dev/null
+++ b/tests/test_scraper.py
@@ -0,0 +1,46 @@
+from os import path
+
+import bs4
+import pytest
+
+from uwhoisd import scraper
+
+HERE = path.dirname(__file__)
+
+
+def test_extract_zone_urls():
+ with open(path.join(HERE, "iana-root-zone.html"), encoding="utf-8") as fh:
+ body = bs4.BeautifulSoup(fh, "html.parser")
+ result = list(scraper.extract_zone_urls("http://example.com", body))
+ # Zones marked "test", "Not assigned", or "Retired" must be filtered out
+ assert result == [
+ ("aaa", "http://example.com/domains/root/db/aaa.html"),
+ ("bt", "http://example.com/domains/root/db/bt.html"),
+ ("xxx", "http://example.com/domains/root/db/xxx.html"),
+ ]
+
+
+def test_extract_zone_urls_edge_cases():
+ empty_body = bs4.BeautifulSoup("", "html.parser")
+ assert list(scraper.extract_zone_urls("http://example.com", empty_body)) == []
+
+
+def test_extract_whois_server():
+ with open(path.join(HERE, "zone-info-fragment.html"), encoding="utf-8") as fh:
+ body = bs4.BeautifulSoup(fh, "html.parser")
+ result = scraper.extract_whois_server(body)
+ assert result == "whois.nic.abc"
+
+
+@pytest.mark.parametrize(
+ "fragment",
+ [
+ "",
+ "WHOIS Server: ",
+ "WHOIS Server:",
+ ],
+)
+def test_extract_whois_server_no_matches(fragment):
+ body = bs4.BeautifulSoup(fragment, "html.parser")
+ result = scraper.extract_whois_server(body)
+ assert result is None
diff --git a/tests/zone-info-fragment.html b/tests/zone-info-fragment.html
new file mode 100644
index 0000000..90831b4
--- /dev/null
+++ b/tests/zone-info-fragment.html
@@ -0,0 +1,11 @@
+
+
+
+ Registry Information
+
+ URL for registration services: http://abc.com
+ WHOIS Server: whois.nic.abc
+
+
+