Skip to content

Commit

Permalink
Added Support for JQ syntax in include/exclude mixer config (#131)
Browse files: browse the repository at this point in the history
  • Loading branch information
soldni authored Mar 1, 2024
1 parent 5b8109d commit cd7d983
Show file tree
Hide file tree
Showing 7 changed files with 497 additions and 10 deletions.
113 changes: 113 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ tokio-util = "0.7.7"
unicode-segmentation = "1.7"
openssl = { version = "0.10.63", features = ["vendored"] }
adblock = { version = "0.8.6", features = ["content-blocking"] }
jaq-core = "1.2.1"
jaq-std = "1.2.1"
jaq-parse = "1.0.2"
jaq-interpret = { version = "1.2.1", features = ["serde_json"] }

# [target.'cfg(target_arch = "aarch64")'.dependencies]
# openssl = { version = "0.10.63", features = ["vendored"] }
Expand Down
16 changes: 16 additions & 0 deletions python/dolma/cli/mixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,21 @@ class StreamOutputConfig:
class FilterConfig:
    """Document filter for a mixer stream.

    ``include`` and ``exclude`` hold filter expressions written in the language
    named by ``syntax``. The mixer accepts ``"jsonpath"`` (the default) or
    ``"jq"`` for ``syntax``.
    """

    # Help text names both languages: the expressions are no longer JSONPath-only
    # now that `syntax` selects how they are interpreted.
    include: List[str] = field(
        default=[], help="Expressions (JSONPath or jq, depending on `syntax`) to include documents"
    )
    exclude: List[str] = field(
        default=[], help="Expressions (JSONPath or jq, depending on `syntax`) to exclude documents"
    )
    syntax: str = field(
        default="jsonpath",
        help="Syntax to use for filter expressions. Can be either JSONPath or jq. Defaults to JSONPath.",
    )


@dataclass
class SpanReplacementConfig:
    """Span replacement rule for a mixer stream.

    ``span`` selects the spans to replace, ``min_score`` is the minimum score a
    span needs in order to be replaced, and ``replacement`` is the text
    substituted for it. ``syntax`` names the expression language of ``span``;
    only ``"jsonpath"`` is accepted at the moment.
    """

    span: str = field(help="JSONPath expression for the span to replace")
    min_score: float = field(default=0.5, help="Minimum score for the span to be replaced")
    replacement: str = field(default="", help="Replacement for the span")
    # The help text refers to the span expression; it previously said "filter
    # expressions", copied over from the filter configuration.
    syntax: str = field(
        default="jsonpath",
        help="Syntax to use for span expressions. Currently only JSONPath is supported. Defaults to JSONPath.",
    )


@dataclass
Expand Down Expand Up @@ -79,12 +87,20 @@ def run(cls, parsed_config: MixerConfig):
if not stream_config.filter.include and not stream_config.filter.exclude:
raise DolmaConfigError("Either `include` or `exclude` must be specified for filter")

if stream_config.filter.syntax not in ["jsonpath", "jq"]:
raise DolmaConfigError("Invalid filter syntax; must be either 'jsonpath' or 'jq'")

stream_config_dict["filter"] = {
"include": [str(i) for i in stream_config.filter.include],
"exclude": [str(i) for i in stream_config.filter.exclude],
"syntax": stream_config.filter.syntax,
}

for span_replacement in stream_config.span_replacement:
if span_replacement.syntax not in ["jsonpath"]:
raise DolmaConfigError("Invalid span_replacement syntax; must be 'jsonpath'")

# TODO: note that we are not using the syntax here yet; adding it later
stream_config_dict.setdefault("span_replacement", []).append(
{
"span": str(span_replacement.span),
Expand Down
26 changes: 26 additions & 0 deletions scripts/sample_prefix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import random
from typing import TYPE_CHECKING
from dolma.core.paths import glob_path

import necessary

with necessary.necessary("click") as CLICK_AVAILABLE:
if CLICK_AVAILABLE or TYPE_CHECKING:
import click


@click.command()
@click.option("--prefix")
@click.option("--ratio", type=float)
@click.option("--seed", type=int, default=0)
def main(prefix: str, ratio: float, seed: int):
    """Print a random sample of the paths yielded by ``glob_path(prefix)``.

    Each path is printed independently when a fresh ``random.random()`` draw is
    below ``ratio``. The generator is seeded with ``seed``, so the draw
    sequence is the same for a given seed.

    Args:
        prefix: Pattern handed to ``glob_path``.
        ratio: Sampling probability; must satisfy ``0 < ratio < 1``.
        seed: Seed for the ``random`` module (defaults to 0).

    Raises:
        click.BadParameter: If ``--ratio`` is missing or outside ``(0, 1)``.
    """
    # Validate explicitly rather than with `assert`: asserts are stripped under
    # `python -O`, and an omitted --ratio arrives here as None.
    if ratio is None or not 0 < ratio < 1:
        raise click.BadParameter("must be a number strictly between 0 and 1", param_hint="--ratio")

    random.seed(seed)

    for path in glob_path(prefix):
        if random.random() < ratio:
            print(path)


if __name__ == "__main__":
    main()
Loading

0 comments on commit cd7d983

Please sign in to comment.