Note: Known Limitations with EasyOCR (EasyOcrOptions).
- Table structure is often lost or misaligned in the OCR output.
- Spelling inaccuracies are occasionally observed (e.g., "Verlängern" → "Verlängenng").
- URLs are not parsed correctly (e.g., "www.telekom.de/agb" → "www telekom delagb").
While investigating EasyOCR issues and testing alternative OCR engines,
we observed that some documents produced distorted text with irregular whitespace.
This disrupts the natural sentence flow and significantly reduces readability.
Example:
"pra kti sche i nform ati o nen zu i h rer fam i l y card basi c Li eber
Tel ekom Kunde, schön, dass Si e si ch f ür..."
Despite these limitations, we have decided to proceed with EasyOCR.
CleanMarkdownRenderer
Bases: HTMLRenderer
Custom Markdown renderer extending mistletoe's HTMLRenderer to clean up
unwanted elements from Markdown input.
Source code in wurzel/steps/docling/docling_step.py
```python
class CleanMarkdownRenderer(HTMLRenderer):
    """Custom Markdown renderer extending mistletoe's HTMLRenderer to clean up
    unwanted elements from Markdown input.
    """

    @staticmethod
    def render_html_block(token):
        """Render HTML block tokens and clean up unwanted elements.

        This method removes HTML comments and returns the cleaned HTML content.
        Remove comments like <!-- image -->
        """
        soup = BeautifulSoup(token.content, "html.parser")
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup.decode_contents().strip()
```
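As a usage sketch (assuming `mistletoe` and `beautifulsoup4` are installed; `MTDocument` mirrors the module's alias for `mistletoe.Document`), the renderer strips HTML comment placeholders such as `<!-- image -->` while leaving the rest of the rendered HTML intact:

```python
from mistletoe import Document as MTDocument

from wurzel.steps.docling.docling_step import CleanMarkdownRenderer

# Markdown containing an HTML comment placeholder, as docling emits for images.
md_text = "<!-- image -->\n\n# Family Card Basic\n\nSome body text."

with CleanMarkdownRenderer() as renderer:
    html = renderer.render(MTDocument(md_text))

print(html)  # the <!-- image --> comment no longer appears in the rendered HTML
```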
render_html_block(token)
staticmethod
Render HTML block tokens and clean up unwanted elements.
This method removes HTML comments and returns the cleaned HTML content.
Removes comments like <!-- image -->.
Source code in wurzel/steps/docling/docling_step.py
```python
@staticmethod
def render_html_block(token):
    """Render HTML block tokens and clean up unwanted elements.

    This method removes HTML comments and returns the cleaned HTML content.
    Remove comments like <!-- image -->
    """
    soup = BeautifulSoup(token.content, "html.parser")
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup.decode_contents().strip()
```
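The comment removal itself is plain BeautifulSoup: comments are `Comment` nodes, and `extract()` detaches them from the tree. A minimal standalone sketch of the same idea:

```python
from bs4 import BeautifulSoup, Comment

html = "<!-- image --><p>Liebe Kundin, lieber Kunde</p>"
soup = BeautifulSoup(html, "html.parser")

# Drop every comment node, keeping the surrounding markup intact.
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    comment.extract()

print(soup.decode_contents().strip())  # <p>Liebe Kundin, lieber Kunde</p>
```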
DoclingStep
Bases: TypedStep[DoclingSettings, None, list[MarkdownDataContract]]
Step to return local Markdown files with enhanced PDF extraction for German.
Source code in wurzel/steps/docling/docling_step.py
```python
class DoclingStep(TypedStep[DoclingSettings, None, list[MarkdownDataContract]]):
    """Step to return local Markdown files with enhanced PDF extraction for German."""

    def __init__(self):
        super().__init__()
        self.converter = self.create_converter()

    def create_converter(self) -> DocumentConverter:
        """Create and configure the document converter for PDF and DOCX.

        Returns:
            DocumentConverter: Configured document converter.
        """
        pipeline_options = PdfPipelineOptions()
        ocr_options = EasyOcrOptions()
        pipeline_options.ocr_options = ocr_options
        return DocumentConverter(
            allowed_formats=self.settings.FORMATS,
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                )
            },
        )

    @staticmethod
    def extract_keywords(md_text: str) -> str:
        """Cleans a Markdown string using mistletoe and extracts useful content.

        - Parses and renders the Markdown content into HTML using a custom HTML renderer
        - Removes unwanted HTML comments and escaped underscores
        - Extracts the first heading from the content (e.g., `<h1>` to `<h6>`)
        - Converts the cleaned HTML into plain text

        Args:
            md_text (str): The raw Markdown input string.
        """
        with MD_RENDER_LOCK, CleanMarkdownRenderer() as renderer:
            ast = MTDocument(md_text)
            cleaned = renderer.render(ast).replace("\n", "")
        soup = BeautifulSoup(cleaned, "html.parser")
        first_heading_tag = soup.find(["h1", "h2", "h3", "h4", "h5", "h6"])
        heading = first_heading_tag.get_text(strip=True) if first_heading_tag else ""
        return heading

    def run(self, inpt: None) -> list[MarkdownDataContract]:
        """Run the document extraction and conversion process for German PDFs.

        Args:
            inpt (None): Input parameter (not used).

        Returns:
            List[MarkdownDataContract]: List of converted Markdown contracts.
        """
        urls = self.settings.URLS
        contracts = []
        for url in urls:
            try:
                converted_contract = self.converter.convert(url)
                md = converted_contract.document.export_to_markdown(image_placeholder="")
                keyword = self.extract_keywords(md)
                contract_instance = {"md": md, "keywords": " ".join([self.settings.DEFAULT_KEYWORD, keyword]), "url": url}
                contracts.append(contract_instance)
            except (FileNotFoundError, OSError) as e:
                log.error(f"Failed to verify URL: {url}. Error: {e}")
                continue
        return contracts
```
create_converter()
Create and configure the document converter for PDF and DOCX.
Returns:
- DocumentConverter – Configured document converter.
Source code in wurzel/steps/docling/docling_step.py
```python
def create_converter(self) -> DocumentConverter:
    """Create and configure the document converter for PDF and DOCX.

    Returns:
        DocumentConverter: Configured document converter.
    """
    pipeline_options = PdfPipelineOptions()
    ocr_options = EasyOcrOptions()
    pipeline_options.ocr_options = ocr_options
    return DocumentConverter(
        allowed_formats=self.settings.FORMATS,
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        },
    )
```
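For comparison, a standalone converter of the same shape can be assembled directly from docling. This is only a sketch: the import paths follow docling's published examples, and the `lang=["de"]` hint for EasyOCR, as well as the `document.pdf` input, are illustrative assumptions rather than part of this step:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Assumption: constraining EasyOCR to German may reduce the spelling errors
# noted above; the default language list is broader.
pipeline_options = PdfPipelineOptions()
pipeline_options.ocr_options = EasyOcrOptions(lang=["de"])

converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)

result = converter.convert("document.pdf")  # illustrative local path or URL
markdown = result.document.export_to_markdown(image_placeholder="")
```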
extract_keywords(md_text)
staticmethod
Cleans a Markdown string using mistletoe and extracts useful content.
- Parses and renders the Markdown content into HTML using a custom HTML renderer
- Removes unwanted HTML comments and escaped underscores
- Extracts the first heading from the content (e.g., <h1> to <h6>)
- Converts the cleaned HTML into plain text
Parameters:
- md_text (str) – The raw Markdown input string.
Source code in wurzel/steps/docling/docling_step.py
```python
@staticmethod
def extract_keywords(md_text: str) -> str:
    """Cleans a Markdown string using mistletoe and extracts useful content.

    - Parses and renders the Markdown content into HTML using a custom HTML renderer
    - Removes unwanted HTML comments and escaped underscores
    - Extracts the first heading from the content (e.g., `<h1>` to `<h6>`)
    - Converts the cleaned HTML into plain text

    Args:
        md_text (str): The raw Markdown input string.
    """
    with MD_RENDER_LOCK, CleanMarkdownRenderer() as renderer:
        ast = MTDocument(md_text)
        cleaned = renderer.render(ast).replace("\n", "")
    soup = BeautifulSoup(cleaned, "html.parser")
    first_heading_tag = soup.find(["h1", "h2", "h3", "h4", "h5", "h6"])
    heading = first_heading_tag.get_text(strip=True) if first_heading_tag else ""
    return heading
```
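Because extract_keywords is a staticmethod, it can be tried without instantiating the step (a minimal sketch; the sample Markdown is illustrative and assumes mistletoe and beautifulsoup4 are available):

```python
from wurzel.steps.docling.docling_step import DoclingStep

md = "<!-- image -->\n\n# Family Card Basic\n\nLiebe Kundin, lieber Kunde ..."
print(DoclingStep.extract_keywords(md))  # -> "Family Card Basic"
```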
run(inpt)
Run the document extraction and conversion process for German PDFs.
Parameters:
- inpt (None) – Input parameter (not used).
Returns:
- list[MarkdownDataContract] – List of converted Markdown contracts.
Source code in wurzel/steps/docling/docling_step.py
```python
def run(self, inpt: None) -> list[MarkdownDataContract]:
    """Run the document extraction and conversion process for German PDFs.

    Args:
        inpt (None): Input parameter (not used).

    Returns:
        List[MarkdownDataContract]: List of converted Markdown contracts.
    """
    urls = self.settings.URLS
    contracts = []
    for url in urls:
        try:
            converted_contract = self.converter.convert(url)
            md = converted_contract.document.export_to_markdown(image_placeholder="")
            keyword = self.extract_keywords(md)
            contract_instance = {"md": md, "keywords": " ".join([self.settings.DEFAULT_KEYWORD, keyword]), "url": url}
            contracts.append(contract_instance)
        except (FileNotFoundError, OSError) as e:
            log.error(f"Failed to verify URL: {url}. Error: {e}")
            continue
    return contracts
```
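Each returned entry pairs the exported Markdown with a keyword string (DEFAULT_KEYWORD followed by the document's first heading) and the source URL. A sketch of the expected shape, with purely illustrative values:

```python
# Hypothetical result for a single configured URL:
contracts = [
    {
        "md": "# Family Card Basic\n\nLiebe Kundin, lieber Kunde ...",
        "keywords": "<DEFAULT_KEYWORD> Family Card Basic",  # DEFAULT_KEYWORD + first heading
        "url": "https://example.com/family-card-basic.pdf",
    }
]
```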