Scraper parsing tests by kgaughan · Pull Request #41 · kgaughan/uwhoisd

Scraper parsing tests #41


Merged · 6 commits · Aug 22, 2024
62 changes: 34 additions & 28 deletions src/uwhoisd/scraper.py
@@ -40,7 +40,7 @@ def fetch_ipv4_assignments(url: str):
     yield prefix, whois
 
 
-def fetch(session: requests.Session, url: str):
+def fetch(session: requests.Session, url: str) -> BeautifulSoup:
     """
     Fetch a URL and parse it with Beautiful Soup for scraping.
     """
@@ -55,7 +55,7 @@ def munge_zone(zone: str) -> str:
     return zone.strip("\u200e\u200f.").encode("idna").decode().lower()
 
 
-def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]):
+def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str]) -> t.Iterator[t.Tuple[str, str]]:
     """
     Scrape IANA's root zone database for WHOIS servers.
     """
@@ -64,46 +64,52 @@ def scrape_whois_from_iana(root_zone_db_url: str, existing: t.Mapping[str, str])
     logging.info("Scraping %s", root_zone_db_url)
     body = fetch(session, root_zone_db_url)
 
-    for link in body.select("#tld-table .tld a"):
-        if "href" not in link.attrs or link.string is None:
-            continue
-
-        zone = munge_zone(link.string)
+    for zone, zone_url in extract_zone_urls(root_zone_db_url, body):
         # If we've already scraped this TLD, ignore it.
         if zone in existing:
             yield (zone, existing[zone])
             continue
 
-        # Is this a zone we should skip/ignore?
-        row = link.parent.parent.parent.findChildren("td")
-        if row[1].string == "test":
-            continue
-        if row[2].string in ("Not assigned", "Retired"):
-            continue
-
-        zone_url = urljoin(root_zone_db_url, link.attrs["href"])
         logging.info("Scraping %s", zone_url)
         body = fetch(session, zone_url)
 
-        whois_server_label = body.find("b", string="WHOIS Server:")
-        whois_server = ""
-        if whois_server_label is not None:
-            whois_server = whois_server_label.next_sibling.strip().lower()
-
+        whois_server = extract_whois_server(body)
         # Fallback to trying whois.nic.*
-        if whois_server == "":
+        if whois_server is None:
             whois_server = f"whois.nic.{zone}"
             logging.info("Trying fallback server: %s", whois_server)
             try:
                 socket.gethostbyname(whois_server)
             except socket.gaierror:
-                whois_server = ""
+                logging.info("No WHOIS server found for %s", zone)
+                continue
 
-        if whois_server == "":
-            logging.info("No WHOIS server found for %s", zone)
-        else:
-            logging.info("WHOIS server for %s is %s", zone, whois_server)
-            yield (zone, whois_server)
+        logging.info("WHOIS server for %s is %s", zone, whois_server)
+        yield (zone, whois_server)
+
+
+def extract_zone_urls(base_url: str, body: BeautifulSoup) -> t.Iterator[t.Tuple[str, str]]:
+    for link in body.select("#tld-table .tld a"):
+        if "href" not in link.attrs or link.string is None:  # pragma: no cover
+            continue
+        row = link.find_parent("tr")
+        if row is None:  # pragma: no cover
+            continue
+        tds = row.find_all("td")
+        # Is this a zone we should skip/ignore?
+        if tds[1].string == "test":
+            continue
+        if tds[2].string in ("Not assigned", "Retired"):
+            continue
+
+        yield (munge_zone(link.string), urljoin(base_url, link.attrs["href"]))
+
+
+def extract_whois_server(body: BeautifulSoup) -> t.Optional[str]:
+    whois_server_label = body.find("b", string="WHOIS Server:")
+    if whois_server_label is None or whois_server_label.next_sibling is None:
+        return None
+    server = whois_server_label.next_sibling.text.strip().lower()
+    return None if server == "" else server
 
 
 def make_arg_parser() -> argparse.ArgumentParser:
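
Splitting the parsing out of scrape_whois_from_iana() means both halves can now be exercised without any network I/O, which is what the new tests below rely on. A minimal sketch of extract_whois_server() in isolation (the HTML fragment here is invented for illustration):

    import bs4

    from uwhoisd import scraper

    # Hypothetical fragment in the shape IANA uses on its zone pages.
    body = bs4.BeautifulSoup("<p><b>WHOIS Server:</b> whois.nic.example</p>", "html.parser")
    assert scraper.extract_whois_server(body) == "whois.nic.example"

    # A page with no label at all yields None, which is what triggers the
    # whois.nic.* fallback in scrape_whois_from_iana().
    assert scraper.extract_whois_server(bs4.BeautifulSoup("", "html.parser")) is None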
69 changes: 69 additions & 0 deletions tests/iana-root-zone.html
@@ -0,0 +1,69 @@
<!DOCTYPE html>
<html>
<body>
<div id="body">
<article class="hemmed sidenav">
<main>
<div class="iana-table-frame">
<table id="tld-table" class="iana-table">
<thead>
<tr>
<th>Domain</th>
<th>Type</th>
<th>TLD Manager</th>
</tr>
</thead>
<tbody>
<tr>
<td><span
class="domain tld"><a>.broken</a></span></td>
<td>broken</td>
<td>This is just here for coverage</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/aaa.html">.aaa</a></span></td>
<td>generic</td>
<td>American Automobile Association,
Inc.</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/bt.html">.bt</a></span></td>
<td>country-code</td>
<td>Ministry of Information and
Communications</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/dummy.html">.dummy</a></span></td>
<td>generic</td>
<td>Not assigned</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/silly.html">.silly</a></span></td>
<td>generic</td>
<td>Retired</td>
</tr>

<tr>
<td><span class="domain tld"><a
href="/domains/root/db/xn--9t4b11yi5a.html">.테스트</a></span></td>
<td>test</td>
<td>Not assigned</td>
</tr>
<tr>
<td><span class="domain tld"><a
href="/domains/root/db/xxx.html">.xxx</a></span></td>
<td>sponsored</td>
<td>ICM Registry LLC</td>
</tr>
</tbody>
</table>
</div>
</main>
</article>
</div>
</body>
</html>
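
The .broken row (an anchor with no href), the "Not assigned" and "Retired" rows, and the .테스트 row (type "test") exist purely to hit the skip branches in extract_zone_urls(). For rows that survive filtering, munge_zone() IDNA-encodes the label; a rough illustration of what it would do to the IDN label above, based on the implementation shown in the diff:

    # munge_zone(".테스트") strips directional marks and dots, then IDNA-encodes:
    assert ".테스트".strip("\u200e\u200f.").encode("idna").decode().lower() == "xn--9t4b11yi5a"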
46 changes: 46 additions & 0 deletions tests/test_scraper.py
@@ -0,0 +1,46 @@
from os import path

import bs4
import pytest

from uwhoisd import scraper

HERE = path.dirname(__file__)


def test_extract_zone_urls():
with open(path.join(path.dirname(__file__), "iana-root-zone.html"), encoding="utf-8") as fh:
body = bs4.BeautifulSoup(fh, "html.parser")
result = list(scraper.extract_zone_urls("http://example.com", body))
# The test zone should not appear
assert result == [
("aaa", "http://example.com/domains/root/db/aaa.html"),
("bt", "http://example.com/domains/root/db/bt.html"),
("xxx", "http://example.com/domains/root/db/xxx.html"),
]


def test_extract_zone_urls_edge_cases():
empty_body = bs4.BeautifulSoup("", "html.parser")
assert list(scraper.extract_zone_urls("http://example.com", empty_body)) == []


def test_extract_whois_server():
with open(path.join(path.dirname(__file__), "zone-info-fragment.html"), encoding="utf-8") as fh:
body = bs4.BeautifulSoup(fh, "html.parser")
result = scraper.extract_whois_server(body)
assert result == "whois.nic.abc"


@pytest.mark.parametrize(
"fragment",
[
"<html><body></body></html>",
"<html><body><b>WHOIS Server:</b> </body></html>",
"<html><body><b>WHOIS Server:</b></body></html>",
],
)
def test_extract_whois_server_no_matches(fragment):
body = bs4.BeautifulSoup(fragment, "html.parser")
result = scraper.extract_whois_server(body)
assert result is None
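
For the record, the three parametrized fragments each take a different exit path out of extract_whois_server(), assuming the implementation in the diff above:

    # "<html><body></body></html>"        -> find() returns None (no label at all)
    # "<b>WHOIS Server:</b> "             -> sibling strips to "", so None
    # "<b>WHOIS Server:</b>" (no sibling) -> next_sibling is None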
11 changes: 11 additions & 0 deletions tests/zone-info-fragment.html
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<body>
<h2>Registry Information</h2>
<p>
<b>URL for registration services:</b> <a
href="http://abc.com">http://abc.com</a><br>
<b>WHOIS Server:</b> whois.nic.abc
</p>
</body>
</html>
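
A side note on this fixture: because extract_whois_server() reads next_sibling.text before stripping (where the old inline code called .strip() on the sibling node directly), it would also cope if the server name were wrapped in a tag. A hypothetical sketch:

    import bs4

    from uwhoisd import scraper

    # Hypothetical markup with the server name wrapped in an extra tag.
    body = bs4.BeautifulSoup("<b>WHOIS Server:</b><span>whois.nic.abc</span>", "html.parser")
    assert scraper.extract_whois_server(body) == "whois.nic.abc"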