Bases: TypedStep[DropStettings, list[MarkdownDataContract], list[MarkdownDataContract]]
Step that removes duplicate Markdown documents from its input. Documents are compared on the fields listed in the `DROP_BY_FIELDS` setting; the value `["*"]` compares all fields.
Source code in wurzel/steps/duplication.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
class DropDuplicationStep(TypedStep[DropStettings, list[MarkdownDataContract], list[MarkdownDataContract]]):
    """Step that removes duplicate Markdown documents from its input.

    Two documents count as duplicates when they agree on every field listed in
    ``settings.DROP_BY_FIELDS``. The value ``["*"]`` means "compare all fields".
    """

    def run(self, inpt: list[MarkdownDataContract]) -> list[MarkdownDataContract]:
        """Return ``inpt`` without duplicates, keeping the first occurrence of each.

        Args:
            inpt: Documents to deduplicate.

        Returns:
            ``inpt`` itself when it is empty or contains no duplicates; otherwise
            a new list with the retained documents in their original order.
        """
        if not inpt:
            # An empty DataFrame has no columns, so checking duplicates on named
            # fields would fail; there is nothing to deduplicate anyway.
            return inpt

        # Resolve the wildcard into a local instead of overwriting the settings,
        # so running the step does not change its configuration.
        subset = self.settings.DROP_BY_FIELDS
        if subset == ["*"]:
            subset = None  # pandas treats None as "use all columns"

        df = pd.DataFrame(i.model_dump() for i in inpt)
        # One row per input document, in input order; True marks a repeat of an
        # earlier row (pandas default keep="first").
        is_duplicate = df.duplicated(subset)
        if not is_duplicate.any():
            return inpt

        # Keep the original objects rather than rebuilding them from DataFrame
        # rows: this avoids unvalidated reconstruction and any value coercion
        # introduced by the round trip through pandas.
        kept = [doc for doc, dup in zip(inpt, is_duplicate) if not dup]
        log.warning(
            "Removed duplicates",
            extra={
                # NOTE: despite the key name this is the fraction retained (after / before).
                "percentage": len(kept) / len(inpt),
                "before": len(inpt),
                "after": len(kept),
                "by": str(subset),
            },
        )
        return kept
|
run(inpt)
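Executes the deduplication step: drops input documents that duplicate an earlier one on the configured `DROP_BY_FIELDS`, logs a warning with before/after counts when anything was removed,
and returns the input unchanged when no duplicates are found.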
Source code in wurzel/steps/duplication.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def run(self, inpt: list[MarkdownDataContract]) -> list[MarkdownDataContract]:
    """Return ``inpt`` without duplicates, keeping the first occurrence of each.

    Documents are compared on the fields in ``self.settings.DROP_BY_FIELDS``;
    the value ``["*"]`` means "compare all fields".

    Args:
        inpt: Documents to deduplicate.

    Returns:
        ``inpt`` itself when it is empty or contains no duplicates; otherwise
        a new list with the retained documents in their original order.
    """
    if not inpt:
        # An empty DataFrame has no columns, so checking duplicates on named
        # fields would fail; there is nothing to deduplicate anyway.
        return inpt

    # Resolve the wildcard into a local instead of overwriting the settings,
    # so running the step does not change its configuration.
    subset = self.settings.DROP_BY_FIELDS
    if subset == ["*"]:
        subset = None  # pandas treats None as "use all columns"

    df = pd.DataFrame(i.model_dump() for i in inpt)
    # One row per input document, in input order; True marks a repeat of an
    # earlier row (pandas default keep="first").
    is_duplicate = df.duplicated(subset)
    if not is_duplicate.any():
        return inpt

    # Keep the original objects rather than rebuilding them from DataFrame
    # rows: this avoids unvalidated reconstruction and any value coercion
    # introduced by the round trip through pandas.
    kept = [doc for doc, dup in zip(inpt, is_duplicate) if not dup]
    log.warning(
        "Removed duplicates",
        extra={
            # NOTE: despite the key name this is the fraction retained (after / before).
            "percentage": len(kept) / len(inpt),
            "before": len(inpt),
            "after": len(kept),
            "by": str(subset),
        },
    )
    return kept
|