Consists of DVCSteps to embed files and save them, for example, as CSV.

DropDuplicationStep

Bases: TypedStep[DropStettings, list[MarkdownDataContract], list[MarkdownDataContract]]

Step that removes duplicate Markdown documents, comparing them by the fields listed in DROP_BY_FIELDS.

Source code in wurzel/steps/duplication.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class DropDuplicationStep(TypedStep[DropStettings, list[MarkdownDataContract], list[MarkdownDataContract]]):
    """Step that removes duplicate Markdown documents from its input.

    Two documents count as duplicates when they agree on every field named in
    ``settings.DROP_BY_FIELDS``; the first occurrence of each is kept.
    """

    def run(self, inpt: list[MarkdownDataContract]) -> list[MarkdownDataContract]:
        """Return ``inpt`` without duplicates, preserving the original order.

        Args:
            inpt: Documents to de-duplicate.

        Returns:
            ``inpt`` itself when it contains no duplicates; otherwise a new
            list holding the first occurrence of every distinct document.
        """
        # ``["*"]`` means "compare every field", which pandas expresses as
        # ``subset=None``. Resolve it into a local so the settings object is
        # never mutated by running the step.
        configured = self.settings.DROP_BY_FIELDS
        subset = None if configured == ["*"] else configured

        df = pd.DataFrame(i.model_dump() for i in inpt)
        is_duplicate = df.duplicated(subset)
        if not is_duplicate.any():
            return inpt

        # Filter the original objects with the duplicate mask instead of
        # rebuilding them from DataFrame rows: this keeps the already validated
        # instances and avoids any value coercion from the pandas round-trip.
        kept = [doc for doc, dup in zip(inpt, is_duplicate.tolist()) if not dup]
        log.warning(
            "Removed duplicates",
            extra={
                # Fraction (0-1) of documents that were kept.
                "percentage": len(kept) / len(df),
                "before": len(df),
                "after": len(kept),
                "by": str(subset),
            },
        )
        return kept

run(inpt)

Executes the de-duplication step: drops input Markdown documents that duplicate an earlier one in the fields given by DROP_BY_FIELDS, and returns the input unchanged when no duplicates are found.

Source code in wurzel/steps/duplication.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def run(self, inpt: list[MarkdownDataContract]) -> list[MarkdownDataContract]:
    """Return ``inpt`` without duplicates, preserving the original order.

    Two documents count as duplicates when they agree on every field named in
    ``self.settings.DROP_BY_FIELDS``; the first occurrence of each is kept.

    Args:
        inpt: Documents to de-duplicate.

    Returns:
        ``inpt`` itself when it contains no duplicates; otherwise a new list
        holding the first occurrence of every distinct document.
    """
    # ``["*"]`` means "compare every field", which pandas expresses as
    # ``subset=None``. Resolve it into a local so the settings object is
    # never mutated by running the step.
    configured = self.settings.DROP_BY_FIELDS
    subset = None if configured == ["*"] else configured

    df = pd.DataFrame(i.model_dump() for i in inpt)
    is_duplicate = df.duplicated(subset)
    if not is_duplicate.any():
        return inpt

    # Filter the original objects with the duplicate mask instead of
    # rebuilding them from DataFrame rows: this keeps the already validated
    # instances and avoids any value coercion from the pandas round-trip.
    kept = [doc for doc, dup in zip(inpt, is_duplicate.tolist()) if not dup]
    log.warning(
        "Removed duplicates",
        extra={
            # Fraction (0-1) of documents that were kept.
            "percentage": len(kept) / len(df),
            "before": len(df),
            "after": len(kept),
            "by": str(subset),
        },
    )
    return kept

DropStettings

Bases: Settings

Settings for DropDuplicationStep: set DROP_BY_FIELDS to the list of field names used to detect duplicates.

Source code in wurzel/steps/duplication.py
21
22
23
24
class DropStettings(Settings):
    """Settings for the duplicate-dropping step.

    ``DROP_BY_FIELDS`` lists the document field names that are compared when
    looking for duplicates. The step treats the special value ``["*"]`` as
    "compare all fields".
    """

    # Field names compared to detect duplicates; by default only "md" is used.
    DROP_BY_FIELDS: list[str] = ["md"]