Scraper parsing tests by kgaughan · Pull Request #41 · kgaughan/uwhoisd

Scraper parsing tests #41


Merged · 6 commits · Aug 22, 2024
62 changes: 34 additions & 28 deletions src/uwhoisd/scraper.py
@@ -40,7 +40,7 @@ def fetch_ipv4_assignments(url: str):
     yield prefix, whois
 
 
-def fetch(session: requests.Session, url: str):
+def fetch(session: requests.Session, url: str) -> BeautifulSoup:
     """
     Fetch a URL and parse it with Beautiful Soup for scraping.
     """
@@ -55,7 +55,7 @@ def munge_zone(zone: str) -> str:
     return zone.strip("\u200e\u200f.").encode("idna").decode().lower()
 
 
-def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]):
+def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]) -> t.Iterator[t.Tuple[str, str]]:
     """
     Scrape IANA's root zone database for WHOIS servers.
     """
@@ -64,46 +64,52 @@ def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str])
     logging.info("Scraping %s", root_zone_db_url)
     body = fetch(session, root_zone_db_url)
 
-    for link in body.select("#tld-table .tld a"):
-        if "href" not in link.attrs or link.string is None:
-            continue
-
-        zone = munge_zone(link.string)
+    for zone, zone_url in extract_zone_urls(root_zone_db_url, body):
         # If we've already scraped this TLD, ignore it.
         if zone in existing:
             yield (zone, existing[zone])
             continue
 
-        # Is this a zone we should skip/ignore?
-        row = link.parent.parent.parent.findChildren("td")
-        if row[1].string == "test":
-            continue
-        if row[2].string in ("Not assigned", "Retired"):
-            continue
-
-        zone_url = urljoin(root_zone_db_url, link.attrs["href"])
         logging.info("Scraping %s", zone_url)
         body = fetch(session, zone_url)
 
-        whois_server_label = body.find("b", string="WHOIS Server:")
-        whois_server = ""
-        if whois_server_label is not None:
-            whois_server = whois_server_label.next_sibling.strip().lower()
-
+        whois_server = extract_whois_server(body)
         # Fallback to trying whois.nic.*
-        if whois_server == "":
+        if whois_server is None:
             whois_server = f"whois.nic.{zone}"
             logging.info("Trying fallback server: %s", whois_server)
             try:
                 socket.gethostbyname(whois_server)
             except socket.gaierror:
-                whois_server = ""
+                logging.info("No WHOIS server found for %s", zone)
+                continue
 
-        if whois_server == "":
-            logging.info("No WHOIS server found for %s", zone)
-        else:
-            logging.info("WHOIS server for %s is %s", zone, whois_server)
-            yield (zone, whois_server)
+        logging.info("WHOIS server for %s is %s", zone, whois_server)
+        yield (zone, whois_server)
+
+
+def extract_zone_urls(base_url: str, body: BeautifulSoup) -> t.Iterator[t.Tuple[str, str]]:
+    for link in body.select("#tld-table .tld a"):
+        if "href" not in link.attrs or link.string is None:  # pragma: no cover
+            continue
+        row = link.find_parent("tr")
+        if row is None:  # pragma: no cover
+            continue
+        tds = row.find_all("td")
+        # Is this a zone we should skip/ignore?
+        if tds[1].string == "test":
+            continue
+        if tds[2].string in ("Not assigned", "Retired"):
+            continue
+
+        yield (munge_zone(link.string), urljoin(base_url, link.attrs["href"]))
+
+
+def extract_whois_server(body: BeautifulSoup) -> t.Optional[str]:
+    whois_server_label = body.find("b", string="WHOIS Server:")
+    if whois_server_label is None or whois_server_label.next_sibling is None:
+        return None
+    server = whois_server_label.next_sibling.text.strip().lower()
+    return None if server == "" else server
 
 
 def make_arg_parser() -> argparse.ArgumentParser:
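
Splitting the parsing out of scrape_whois_from_iana() means both halves can now be exercised without any network I/O, which is what the new tests below rely on. A minimal sketch of extract_whois_server() in isolation (the HTML fragment here is invented for illustration):

    import bs4

    from uwhoisd import scraper

    # Hypothetical fragment in the shape IANA uses on its zone pages.
    body = bs4.BeautifulSoup("<p><b>WHOIS Server:</b> whois.nic.example</p>", "html.parser")
    assert scraper.extract_whois_server(body) == "whois.nic.example"

    # A page with no label at all yields None, which is what triggers the
    # whois.nic.* fallback in scrape_whois_from_iana().
    assert scraper.extract_whois_server(bs4.BeautifulSoup("", "html.parser")) is None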
69 changes: 69 additions & 0 deletions tests/iana-root-zone.html
@@ -0,0 +1,69 @@
<!DOCTYPE html>
<html>
<body>
<div id="body">
<article class="hemmed sidenav">
<main>
<div class="iana-table-frame">
<table id="tld-table" class="iana-table">
<thead>
<tr>
<th>Domain</th>
<th>Type</th>
<th>TLD Manager</th>
</tr>
</thead>
<tbody>
<tr>
<td><span
class="domain tld"><a>.broken</a></span></td>
<td>broken</td>
<td>This is just here for coverage</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/aaa.html">.aaa</a></span></td>
<td>generic</td>
<td>American Automobile Association,
Inc.</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/bt.html">.bt</a></span></td>
<td>country-code</td>
<td>Ministry of Information and
Communications</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/dummy.html">.dummy</a></span></td>
<td>generic</td>
<td>Not assigned</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/silly.html">.silly</a></span></td>
<td>generic</td>
<td>Retired</td>
</tr>

<tr>
<td><span class="domain tld"><a
href="/domains/root/db/xn--9t4b11yi5a.html">.테스트</a></span></td>
<td>test</td>
<td>Not assigned</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/xxx.html">.xxx</a></span></td>
<td>sponsored</td>
<td>ICM Registry LLC</td>
</tr>
</tbody>
</table>
</div>
</main>
</article>
</div>
</body>
</html>
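
The .broken row (an anchor with no href), the "Not assigned" and "Retired" rows, and the .테스트 row (type "test") exist purely to hit the skip branches in extract_zone_urls(). For rows that survive filtering, munge_zone() IDNA-encodes the label; a rough illustration of what it would do to the IDN label above, based on the implementation shown in the diff:

    # munge_zone(".테스트") strips directional marks and dots, then IDNA-encodes:
    assert ".테스트".strip("\u200e\u200f.").encode("idna").decode().lower() == "xn--9t4b11yi5a"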
46 changes: 46 additions & 0 deletions tests/test_scraper.py
@@ -0,0 +1,46 @@
from os import path

import bs4
import pytest

from uwhoisd import scraper

HERE = path.dirname(__file__)


def test_extract_zone_urls():
with open(path.join(path.dirname(__file__), "iana-root-zone.html"), encoding="utf-8") as fh:
body = bs4.BeautifulSoup(fh, "html.parser")
result = list(scraper.extract_zone_urls("http://example.com", body))
# The test zone should not appear
assert result == [
("aaa", "http://example.com/domains/root/db/aaa.html"),
("bt", "http://example.com/domains/root/db/bt.html"),
("xxx", "http://example.com/domains/root/db/xxx.html"),
]


def test_extract_zone_urls_edge_cases():
empty_body = bs4.BeautifulSoup("", "html.parser")
assert list(scraper.extract_zone_urls("http://example.com", empty_body)) == []


def test_extract_whois_server():
with open(path.join(path.dirname(__file__), "zone-info-fragment.html"), encoding="utf-8") as fh:
body = bs4.BeautifulSoup(fh, "html.parser")
result = scraper.extract_whois_server(body)
assert result == "whois.nic.abc"


@pytest.mark.parametrize(
"fragment",
[
"<html><body></body></html>",
"<html><body><b>WHOIS Server:</b> </body></html>",
"<html><body><b>WHOIS Server:</b></body></html>",
],
)
def test_extract_whois_server_no_matches(fragment):
body = bs4.BeautifulSoup(fragment, "html.parser")
result = scraper.extract_whois_server(body)
assert result is None
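
For the record, the three parametrized fragments each take a different exit path out of extract_whois_server(), assuming the implementation in the diff above:

    # "<html><body></body></html>"        -> find() returns None (no label at all)
    # "<b>WHOIS Server:</b> "             -> sibling strips to "", so None
    # "<b>WHOIS Server:</b>" (no sibling) -> next_sibling is None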
11 changes: 11 additions & 0 deletions tests/zone-info-fragment.html
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<body>
<h2>Registry Information</h2>
<p>
<b>URL for registration services:</b> <a
href="http://abc.com">http://abc.com</a><br>
<b>WHOIS Server:</b> whois.nic.abc
</p>
</body>
</html>
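
A side note on this fixture: because extract_whois_server() reads next_sibling.text before stripping (where the old inline code called .strip() on the sibling node directly), it would also cope if the server name were wrapped in a tag. A hypothetical sketch:

    import bs4

    from uwhoisd import scraper

    # Hypothetical markup with the server name wrapped in an extra tag.
    body = bs4.BeautifulSoup("<b>WHOIS Server:</b><span>whois.nic.abc</span>", "html.parser")
    assert scraper.extract_whois_server(body) == "whois.nic.abc"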