- 2024/11/25: Project Initialization
llm-web-kit is a Python library that ...
- Remove headers, footers, footnotes, page numbers, etc., to ensure semantic coherence.
- Output text in human-readable order, suitable for single-column, multi-column, and complex layouts.
import traceback
from typing import Optional

from llm_web_kit.simple import extract_html_to_md
from loguru import logger
def extract(url: str, html: str) -> Optional[str]:
    """Convert an HTML page to Markdown using ``extract_html_to_md``.

    Args:
        url: Page URL, passed through unchanged to ``extract_html_to_md``.
        html: HTML source of the page, passed through unchanged.

    Returns:
        Whatever ``extract_html_to_md(url, html)`` returns, or ``None`` if
        that call raises. The exception is logged and not propagated, so
        callers must check for ``None``.
    """
    try:
        # Alternative from the original sample:
        #   mm_nlp_md = extract_html_to_mm_md(url, html)
        return extract_html_to_md(url, html)
    except Exception as e:
        # Best-effort boundary: record the failure and signal it to the
        # caller with None instead of letting the exception escape.
        logger.exception(e)
        return None
if __name__ == "__main__":
    # Placeholder inputs: both are empty strings in this sample; replace
    # them with a page URL and its HTML source before running.
    url, html = "", ""
    markdown = extract(url, html)