# Copyright (c) 2023-2024 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT
"""This module offers `SoMaJo <https://github.com/tsproisl/SoMaJo>`_ specific tools.
Hint:
Use pip to install the necessary dependencies for this module:
``pip install mltb2[somajo]``
"""
from abc import ABC
from collections.abc import Iterable
from dataclasses import dataclass, field
from typing import Literal, Optional, Union
from somajo import SoMaJo
from tqdm import tqdm
@dataclass
class SoMaJoBaseClass(ABC):
    """Abstract base class for SoMaJo tools.

    Creates and holds the SoMaJo tokenizer instance that the concrete
    subclasses use.

    Args:
        language: The language. ``de_CMC`` for German or ``en_PTB`` for English.

    Note:
        This class is an abstract base class. It should not be used directly.
    """

    language: Literal["de_CMC", "en_PTB"]
    # the tokenizer is derived from ``language`` in ``__post_init__``,
    # so it is excluded from ``__init__`` and ``repr``
    somajo: SoMaJo = field(init=False, repr=False)

    def __post_init__(self):
        """Instantiate the SoMaJo tokenizer for the configured language."""
        self.somajo = SoMaJo(self.language)
def detokenize(tokens) -> str:
    """Convert SoMaJo tokens back into a sentence (string).

    Args:
        tokens: The tokens to be de-tokenized.

    Returns:
        The de-tokenized sentence.

    See Also:
        `How do I split sentences but not words? <https://github.com/tsproisl/SoMaJo/issues/17>`_
    """
    parts = []
    for token in tokens:
        # prefer the original spelling when SoMaJo recorded one
        text = token.text if token.original_spelling is None else token.original_spelling
        parts.append(text)
        if token.space_after:
            parts.append(" ")
    return "".join(parts).strip()
@dataclass
class SoMaJoSentenceSplitter(SoMaJoBaseClass):
    """Use SoMaJo to split text into sentences.

    Args:
        language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
        show_progress_bar: Show a progressbar during processing.
    """

    show_progress_bar: bool = False

    def __call__(self, text: str) -> list[str]:
        """Split the text into a list of sentences.

        Args:
            text: The text to be split.

        Returns:
            The list of sentence splits.
        """
        sentences = self.somajo.tokenize_text([text])
        progress = tqdm(sentences, disable=not self.show_progress_bar)
        # each SoMaJo sentence is a token sequence; turn it back into a string
        return [detokenize(sentence) for sentence in progress]
@dataclass
class JaccardSimilarity(SoMaJoBaseClass):
    """Calculate the `jaccard similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_.

    Args:
        language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
    """

    def get_token_set(self, text: str) -> set[str]:
        """Get token set for text.

        Args:
            text: The text to be tokenized into a set.

        Returns:
            The set of (lower-cased) tokens (words).
        """
        sentences = self.somajo.tokenize_text([text])
        token_set = extract_token_class_set(sentences)  # TODO: filter tokens
        # lower-case so the similarity is case-insensitive
        return {token.lower() for token in token_set}

    def __call__(self, text1: str, text2: str) -> float:
        """Calculate the jaccard similarity for two texts.

        Args:
            text1: Text one.
            text2: Text two.

        Returns:
            The jaccard similarity in ``[0.0, 1.0]``.
        """
        token_set1 = self.get_token_set(text1)
        token_set2 = self.get_token_set(text2)
        union = token_set1 | token_set2
        if not union:
            # both texts produced no tokens at all; the original code would
            # raise ZeroDivisionError here - treat two empty sets as identical
            return 1.0
        intersection = token_set1 & token_set2
        return len(intersection) / len(union)
@dataclass
class UrlSwapper:
    """Tool to swap (and reverse swap) links with a numbered replacement link.

    Args:
        token_extractor: The sentence token extractor to be used.
        url_pattern: The pattern to use for replacement. One ``{}`` marks the place where to put the number.
    """

    token_extractor: TokenExtractor
    url_pattern: str = "https://link-{}.com"
    # map from real url to its numbered replacement url
    _url_map: dict[str, str] = field(init=False, repr=False)

    def __post_init__(self):
        """Do post init."""
        self._url_map = {}

    def swap_urls(self, text: str) -> str:
        """Swap the urls of the text."""
        for url in self.token_extractor.extract_url_set(text):
            swapped = self._url_map.get(url)
            if swapped is None:
                # unknown url: assign it the next running number
                swapped = self.url_pattern.format(len(self._url_map) + 1)
                self._url_map[url] = swapped
            text = text.replace(url, swapped)
        return text

    def reverse_swap_urls(self, text: str) -> tuple[str, set[str]]:
        """Revert the url swap.

        Returns:
            The reverted text and a ``set`` of URLs that were unknown by the ``UrlSwapper``.
        """
        # invert the map: swapped url -> real url
        reverse_url_map = {swapped: real for real, swapped in self._url_map.items()}
        unknown_urls: set[str] = set()
        for url in self.token_extractor.extract_url_set(text):
            real_url = reverse_url_map.get(url)
            if real_url is None:
                unknown_urls.add(url)
            else:
                text = text.replace(url, real_url)
        return text, unknown_urls