Source code for mltb2.bs
# Copyright (c) 2023 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT
"""Beautiful Soup and HTML specific tools.
Hint:
Use pip to install the necessary dependencies for this module:
``pip install mltb2[bs]``
"""
from typing import Any, Optional
import mdformat
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter, markdownify
[docs]
def extract_text(soup: BeautifulSoup, join_str: Optional[str] = None) -> str:
    """Return the text of a BeautifulSoup object as one joined string.

    The strings yielded by ``soup.stripped_strings`` are concatenated with
    ``join_str`` placed between them.

    Warning:
        This implementation has known issues with whitespace handling.

    Args:
        soup: The BeautifulSoup object to extract the text from.
        join_str: Separator placed between the text parts. When ``None`` is
            given, a single space is used.

    Returns:
        The joined text from the BeautifulSoup object.
    """
    separator = " " if join_str is None else join_str
    return separator.join(soup.stripped_strings)
[docs]
def extract_one(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Any) -> Any:
    """Extract exactly one specified element from a BeautifulSoup object.

    This function expects that exactly one result is found.
    Otherwise a RuntimeError is raised.

    Args:
        soup: The BeautifulSoup object to extract the element from.
        name: Name of the tag to extract.
        attrs: Attributes of the tag to extract. ``None`` is treated as an
            empty attribute filter.
        kwargs: Additional keyword arguments, forwarded unchanged to
            ``soup.find_all``.

    Returns:
        The extracted BeautifulSoup element.

    Raises:
        RuntimeError: If not exactly one result is found.
    """
    if attrs is None:
        attrs = {}
    matches = soup.find_all(name, attrs, **kwargs)
    if len(matches) != 1:
        raise RuntimeError(f"Expected exactly one result, but got {len(matches)}!")
    return matches[0]
[docs]
def extract_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Any) -> Any:
    """Extract all specified elements from a BeautifulSoup object.

    Args:
        soup: The BeautifulSoup object to extract the elements from.
        name: Name of the tag to extract.
        attrs: Attributes of the tag to extract. ``None`` is treated as an
            empty attribute filter.
        kwargs: Additional keyword arguments, forwarded unchanged to
            ``soup.find_all``.

    Returns:
        The extracted BeautifulSoup elements, exactly as returned by
        ``soup.find_all``.
    """
    if attrs is None:
        attrs = {}
    return soup.find_all(name, attrs, **kwargs)
[docs]
def remove_all(soup: BeautifulSoup, name=None, attrs: Optional[dict] = None, **kwargs: Any) -> None:
    """Remove all specified elements from a BeautifulSoup object.

    The removal is done in place. Nothing is returned.

    Args:
        soup: The BeautifulSoup object to remove the elements from.
        name: Name of the tag(-s) to remove.
        attrs: Attributes of the tag(-s) to remove. ``None`` is treated as an
            empty attribute filter.
        kwargs: Additional keyword arguments, forwarded unchanged to
            ``soup.find_all``.
    """
    if attrs is None:
        attrs = {}
    # Every match is removed by calling its own ``decompose()`` method.
    for element in soup.find_all(name, attrs, **kwargs):
        element.decompose()
[docs]
def soup_to_md(soup: BeautifulSoup, mdformat_options: Optional[dict] = None) -> str:
    """Convert a BeautifulSoup object to Markdown.

    The soup is first converted with ``MarkdownConverter().convert_soup`` and
    the resulting text is then formatted with ``mdformat.text``.

    When ``mdformat_options`` is ``None`` the following options are passed to
    mdformat:

    - ``number=True``: apply consecutive numbering to ordered lists
    - ``wrap="no"``: paragraph word wrap mode

    No end-of-line option is set here, so mdformat's own default applies.

    See Also:
        The `mdformat Options
        <https://mdformat.readthedocs.io/en/stable/users/installation_and_usage.html#options>`_.

    Args:
        soup: BeautifulSoup object.
        mdformat_options: Options for mdformat. A given dict is used as is
            and is not merged with the defaults above.

    Returns:
        The Markdown text.
    """
    options = {"number": True, "wrap": "no"} if mdformat_options is None else mdformat_options
    raw_markdown = MarkdownConverter().convert_soup(soup)
    return mdformat.text(raw_markdown, options=options)
[docs]
def html_to_md(html: str, mdformat_options: Optional[dict] = None) -> str:
    """Convert HTML to Markdown.

    The HTML is first converted with ``markdownify`` and the resulting text is
    then formatted with ``mdformat.text``.

    When ``mdformat_options`` is ``None`` the following options are passed to
    mdformat:

    - ``number=True``: apply consecutive numbering to ordered lists
    - ``wrap="no"``: paragraph word wrap mode

    No end-of-line option is set here, so mdformat's own default applies.

    See Also:
        The `mdformat Options
        <https://mdformat.readthedocs.io/en/stable/users/installation_and_usage.html#options>`_.

    Args:
        html: HTML text.
        mdformat_options: Options for mdformat. A given dict is used as is
            and is not merged with the defaults above.

    Returns:
        The Markdown text.
    """
    options = {"number": True, "wrap": "no"} if mdformat_options is None else mdformat_options
    raw_markdown = markdownify(html)
    return mdformat.text(raw_markdown, options=options)