Refactor CLI scraping: fix trending words, handle dynamic classes, improve outputs, and add request headers by spithash · Pull Request #1 · agmmnn/etym-cli · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Refactor CLI scraping: fix trending words, handle dynamic classes, improve outputs, and add request headers #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 53 additions & 48 deletions etym_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,82 +9,90 @@
from rich import print as rprint


HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:139.0) Gecko/20100101 Firefox/139.0"}


def req(page, word):
    """Fetch an etymonline.com page and return it parsed with BeautifulSoup.

    page: URL path prefix, e.g. "word/" or "search?q=".
    word: term to look up; URL-quoted before being appended to the path.

    Exits the process with a "Word not found." message on HTTP 404.
    """
    url = "https://www.etymonline.com/" + page + quote(word)
    # Browser-like User-Agent added by this PR — presumably the site
    # filters the default python-requests UA; confirm against the server.
    r = requests.get(url, headers=HEADERS)
    if r.status_code == 404:
        rprint("[italic]Word not found.[/italic]")
        exit()
    return BeautifulSoup(r.content, "lxml")


# plain text output
def o_plain(secs):
    """Print each entry section as plain text.

    secs: iterable of BeautifulSoup section elements. For each section the
    first h1/h2/h3 is printed as the entry title (or a placeholder when
    missing), then every <p>/<blockquote> descendant is printed —
    blockquotes prefixed with ">>", empty paragraphs skipped.
    """
    print()
    for ix, sec in enumerate(secs):
        title = sec.find(["h1", "h2", "h3"])
        if title:
            print(title.text.strip())
        else:
            print("[No title found]")

        pb = sec.find_all(["p", "blockquote"])
        # Separate sections with a blank line, but not after the last one.
        newline = "\n" if ix != len(secs) - 1 else ""
        for j in pb:
            if j.name == "blockquote":
                print(">>" + j.text.strip())
            elif j.name == "p" and j.text.strip() != "":
                print(j.text.strip() + newline)
    print()


# rich output
def o_rich(word, secs, related):
    """Render entry sections in a rich table with a linked title.

    word: the looked-up word (used in the clickable table title).
    secs: iterable of BeautifulSoup section elements; first h1/h2/h3 is the
          section heading, <p>/<blockquote> children form the body.
    related: BeautifulSoup <ul> of related entries, or None when absent.
    """
    table = Table(
        title=f"\n[bright_cyan][link=https://www.etymonline.com/word/{word}]{word} | Online Etymology Dictionary[/link][/bright_cyan]",
        show_header=False,
        box=box.SQUARE,
    )
    table.add_column()
    for ix, sec in enumerate(secs):
        title = sec.find(["h1", "h2", "h3"])
        if title:
            table.add_row(f"[bright_cyan]{title.text.strip()}[/bright_cyan]")
        else:
            table.add_row("[bright_cyan][No title found][/bright_cyan]")

        pb = sec.find_all(["p", "blockquote"])
        for jx, j in enumerate(pb):
            # Blank separator after every paragraph except the very last
            # one of the very last section.
            newline = "" if ix == len(secs) - 1 and jx == len(pb) - 1 else "\n"
            if j.name == "blockquote":
                table.add_row(f">[italic grey82]{j.text.strip()}[/italic grey82]{newline}")
            elif j.name == "p" and j.text.strip() != "":
                table.add_row(j.text.strip() + newline)

    if related:
        related_items = [li.text.strip() for li in related.find_all("li")]
        if related_items:
            table.add_row(f"\n[bright_cyan]Entries related to [italic]{word}[/italic]:[/bright_cyan]")
            table.add_row("[wheat4]" + ", ".join(related_items) + "[/wheat4]")

    rprint(table)


# returns trending words
def o_trend():
    """Print the trending-words list scraped from an entry page's sidebar.

    Fetches a known word page ('test') because the trending sidebar is
    rendered there; prints a warning and returns when the list is missing
    (e.g. after a site markup change).
    """
    soup = req("word/", "test")
    # NOTE(review): matching on this utility-class string is fragile —
    # it breaks whenever the site's CSS framework output changes.
    ul = soup.find("ul", class_="list-none list-none grid grid-cols-1")
    if not ul:
        rprint("[italic red]Could not find trending words list on the page.[/italic red]")
        return
    words = [a.get_text().strip() for a in ul.find_all("a", title=True)]
    rprint("[bright_cyan]Trending Words:[/bright_cyan]")
    rprint(", ".join(words))


# fuzzy search
def o_fuzzy(word):
    """Query etymonline's fuzzy-search API and print the matches.

    word: search key; URL-quoted into the API query string.
    Prints an error and returns on any non-200 response; otherwise the
    JSON response (a list of words) is printed comma-separated.
    """
    r = requests.get(f"https://www.etymonline.com/api/etymology/fuzzy?key={quote(word)}", headers=HEADERS)
    if r.status_code != 200:
        rprint("[italic red]Fuzzy search failed.[/italic red]")
        return
    j = json.loads(r.text)
    rprint("[wheat4]" + ", ".join(j) + "[/wheat4]")


Expand All @@ -95,12 +103,9 @@ def main(word, p, t, f):
o_fuzzy(word)
else:
soup = req("word/", word)
# find all word sections
secs = soup.find_all("div", {"class": "word--C9UPa"})
# find related word section
related = soup.find("ul", {"related__container--22iKI"})
# get output
if p == True:
secs = soup.find_all("section")
related = soup.find("ul", class_="related__container--22iKI")
if p:
o_plain(secs)
else:
o_rich(word, secs, related)
0