Skip to content

Commit

Permalink
Merge pull request #11 from mdmmn378/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
mdmmn378 authored Dec 16, 2023
2 parents 93f53da + 6ee982c commit 51e44cd
Show file tree
Hide file tree
Showing 14 changed files with 189 additions and 52 deletions.
54 changes: 27 additions & 27 deletions .github/workflows/build-publish.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Python
name: CI

on:
push:
Expand All @@ -8,7 +8,7 @@ on:

jobs:
build_and_test:
name: Test
name: Release
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
Expand All @@ -20,31 +20,31 @@ jobs:
command: build
args: --release

# macos:
# runs-on: macos-latest
# strategy:
# matrix:
# python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
# steps:
# - uses: actions/checkout@v3
# - uses: actions/setup-python@v4
# with:
# python-version: ${{ matrix.python-version }}
# architecture: x64
# - uses: dtolnay/rust-toolchain@stable
# - name: Build wheels - universal2
# uses: PyO3/maturin-action@v1
# with:
# args: --release --universal2 --out dist -m Cargo.toml -i ${{ matrix.python-version }}
# - name: Install built wheel - universal2
# run: |
# pip install texy --no-index --find-links dist --force-reinstall
# python -c "import texy"
# - name: Upload wheels
# uses: actions/upload-artifact@v3
# with:
# name: wheels
# path: dist
macos:
runs-on: macos-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
- uses: dtolnay/rust-toolchain@stable
- name: Build wheels - universal2
uses: PyO3/maturin-action@v1
with:
args: --release --out dist -m Cargo.toml -i ${{ matrix.python-version }}
- name: Install built wheel - universal2
run: |
pip install texy --no-index --find-links dist --force-reinstall
python -c "import texy"
- name: Upload wheels
uses: actions/upload-artifact@v3
with:
name: wheels
path: dist

linux:
runs-on: ubuntu-latest
Expand Down
11 changes: 5 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "texy-process"
version = "0.0.2-alpha"
version = "0.0.2"
edition = "2021"

[lib]
Expand All @@ -11,14 +11,13 @@ crate-type = ["cdylib", "rlib"]
regex = "1.7.0"
lazy_static = "1.4.0"
serde_json = "1.0.89"
rayon = "1.6.1"
procspawn = "0.10.1"
# rayon = "1.6.1"
procspawn = "1.0.0"

[dependencies.pyo3]
version = "0.19.0"
features = ["abi3-py37"]
version = "0.20.0"
features = ["abi3-py311"]

[features]
extension-module = ["pyo3/extension-module"]
default = ["extension-module"]

15 changes: 12 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.SHELL := /bin/bash
.PHONY: all format lint test test-rs build-dev build-release
.PHONY: all format lint test test-rs build-dev build-release profile type

all: lint format test test-rs
@echo "All done!"
Expand All @@ -23,8 +23,17 @@ test-rs:

build-dev:
@echo "Building dev..."
-maturin develop
-maturin develop --release

build-release:
@echo "Building release..."
-maturin build --release
-maturin build --release

profile:
@echo "Profiling..."
-python -m tests.profiler


type:
@echo "Running type checker (pyright 1.1.340) ..."
pyright src/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

---

[![Python](https://github.com/mdmmn378/texy/actions/workflows/build-publish.yaml/badge.svg)](https://github.com/mdmmn378/texy/actions/workflows/build-publish.yaml)
[![Python](https://github.com/mdmmn378/texy/actions/workflows/build-publish.yaml/badge.svg)](https://github.com/mdmmn378/texy/actions/workflows/build-publish.yaml) ![PyPI - Version](https://img.shields.io/pypi/v/texy)

> A utility library for quickly cleaning texts
Expand Down
10 changes: 10 additions & 0 deletions lefthook.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
pre-commit:
commands:
lint:
run: make lint
format:
run: make format
type-check:
run: make type
update:
run: git update-index --again
8 changes: 3 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@ name = "texy"
description = "Supercharge text processing"
readme = "README.md"
requires-python = ">=3.6"
version = "0.0.2-alpha"
authors = [
{ name="Mamunur Rahaman Mamun", email="[email protected]" },
]
version = "0.0.2"
authors = [{ name = "Mamunur Rahaman Mamun", email = "[email protected]" }]

[project.urls]
homepage = "https://github.com/mdmmn378/texy"
Expand All @@ -20,4 +18,4 @@ build-backend = "maturin"
features = ["pyo3/extension-module"]

[tool.isort]
profile = "black"
profile = "black"
18 changes: 18 additions & 0 deletions pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"include": [
"src"
],

"exclude": [
"**/node_modules",
"**/__pycache__",
],

"ignore": [
"tests/**/*",
],

"defineConstant": {
"DEBUG": true
}
}
3 changes: 1 addition & 2 deletions src/components/actions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ lazy_static! {
r#"[\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0]"#
)
.unwrap();
pub static ref EMOTICONS: Vec<String> = get_emoticons();
pub static ref RE_HTML: Regex = Regex::new(r"<[^>]*>").unwrap();
pub static ref RE_XML: Regex = Regex::new(r"<[/]?[^>]+>").unwrap();
}
Expand Down Expand Up @@ -58,7 +57,7 @@ pub fn remove_emojis(string: String) -> String {

pub fn remove_emoticons(string: String) -> String {
let mut res = string.clone();
for emo in EMOTICONS.iter() {
for emo in get_emoticons().iter() {
res = res.replace(emo.as_str(), " ");
}
return res.to_string();
Expand Down
32 changes: 32 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
pub mod components;
pub mod pipelines;
pub mod utils;
use pipelines::blocks::{extreme, relaxed, strict};

/// Runs each cleaning pipeline once over a small synthetic corpus.
///
/// The eight sample strings below are repeated 100 times (800 items in
/// total). The corpus is then passed to `extreme`, `relaxed` and `strict`
/// in that order, and the returned vectors are dropped immediately. The only
/// observable output is the corpus size printed before the runs and `Done!`
/// printed after them.
///
/// NOTE(review): judging by the name, this is a driver for watching the
/// process for memory growth from the outside — confirm with the author.
fn leak() {
    let data = [
        "Hello, this is a sample text with\nnewlines.",
        "Visit https://example.com for more info!",
        "Send your feedback to [email protected]",
        "<p>This is an HTML paragraph.</p>",
        "<xml>This is some XML content.</xml>",
        "😃 Removing emoticons and emojis 😊 🚀",
        "This text has infrequent punctuations: !?#",
        "Multiple spaces between words.",
    ];
    // Repeat the samples 100 times, keeping the original sample order within
    // every repetition.
    let v: Vec<String> = (0..100)
        .flat_map(|_| data.iter().map(|s| s.to_string()))
        .collect();
    println!("Data size: {}", v.len());
    // The pipelines take ownership of their input, so the first two calls
    // need their own copy of the corpus.
    extreme(v.clone());
    relaxed(v.clone());
    // Last use of the corpus: move it instead of paying for a third clone.
    strict(v);
    println!("Done!");
}

/// Binary entry point: runs the `leak` driver once and returns.
fn main() {
leak();
}
7 changes: 3 additions & 4 deletions src/pipelines/blocks.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
use crate::components::actions::*;
use pyo3::prelude::*;
use rayon::prelude::*;

#[allow(unused_assignments)]
pub fn relaxed(items: Vec<String>) -> Vec<String> {
let result = items
.par_iter()
.iter()
.map(|elem| {
let mut tmp = String::new();
tmp = remove_newlines(elem.to_string());
Expand All @@ -21,7 +20,7 @@ pub fn relaxed(items: Vec<String>) -> Vec<String> {
#[allow(unused_assignments)]
pub fn strict(items: Vec<String>) -> Vec<String> {
let result = items
.par_iter()
.iter()
.map(|elem| {
let mut tmp = String::new();
tmp = remove_newlines(elem.to_string());
Expand All @@ -42,7 +41,7 @@ pub fn strict(items: Vec<String>) -> Vec<String> {
#[allow(unused_assignments)]
pub fn extreme(items: Vec<String>) -> Vec<String> {
let result = items
.par_iter()
.iter()
.map(|elem| {
let mut tmp = String::new();
tmp = remove_newlines(elem.to_string());
Expand Down
56 changes: 56 additions & 0 deletions tests/profiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import copy
import gc

from memory_profiler import profile


def dummy_clean(data):
    """Return an independent deep copy of ``data`` without cleaning anything.

    The input object is left untouched; nested containers are copied as well,
    so the result shares no mutable state with ``data``.
    """
    duplicate = copy.deepcopy(data)
    return duplicate


@profile
def profile_extreme_clean():
    """Run ``extreme_clean`` once on a large synthetic corpus under ``@profile``.

    Eight sample strings are repeated 100000 times, passed to
    ``texy.pipelines.extreme_clean``, and then both the result and the input
    are deleted before a garbage collection is forced.
    """
    print("Profiling extreme_clean")
    # Imported inside the function body, so the import statement itself runs
    # within the decorated function.
    from texy.pipelines import extreme_clean

    samples = [
        "Hello, this is a sample text with\nnewlines.",
        "Visit https://example.com for more info!",
        "Send your feedback to [email protected]",
        "<p>This is an HTML paragraph.</p>",
        "<xml>This is some XML content.</xml>",
        "😃 Removing emoticons and emojis 😊 🚀",
        "This text has infrequent punctuations: !?#",
        "Multiple spaces between words.",
    ]
    data = samples * 100000
    cleaned_data = extreme_clean(data)

    # Drop the result first, then the input, then force a collection.
    del cleaned_data
    del data
    gc.collect()
    # Trailing no-op statement kept from the original.
    ...


@profile
def profile_dummy_clean():
    """Run ``dummy_clean`` once on a large synthetic corpus under ``@profile``.

    Uses the same eight sample strings repeated 100000 times as
    ``profile_extreme_clean``, but passes them to the deep-copy baseline
    ``dummy_clean`` instead of a texy pipeline.
    """
    samples = [
        "Hello, this is a sample text with\nnewlines.",
        "Visit https://example.com for more info!",
        "Send your feedback to [email protected]",
        "<p>This is an HTML paragraph.</p>",
        "<xml>This is some XML content.</xml>",
        "😃 Removing emoticons and emojis 😊 🚀",
        "This text has infrequent punctuations: !?#",
        "Multiple spaces between words.",
    ]
    data = samples * 100000

    cleaned_data = dummy_clean(data)
    # NOTE(review): unlike profile_extreme_clean, the collection is forced
    # while both lists are still referenced — confirm this order is intended.
    gc.collect()
    del cleaned_data
    del data


if __name__ == "__main__":
    # Only the texy pipeline is profiled by default; uncomment the second
    # call to profile the deep-copy baseline as well.
    profile_extreme_clean()
    # profile_dummy_clean()
2 changes: 1 addition & 1 deletion texy/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.2-alpha"
__version__ = "0.0.2"
21 changes: 19 additions & 2 deletions texy/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Any, Callable, List, Tuple

from .texy import extreme_clean, relaxed_clean, strict_clean # noqa: F401
from .texy import extreme_clean as _extreme_clean
from .texy import relaxed_clean as _relaxed_clean
from .texy import strict_clean as _strict_clean


def _apply_strategy(
Expand All @@ -14,7 +16,7 @@ def _apply_strategy(
def parallelize(
strategy: Callable[[List[str]], List[str]], data: List[str], max_workers: int
) -> List[str]:
"""Using this function is not recommended for most of the cases."""
"""Parallelize a pipeline with Python multiprocessing."""
if not max_workers:
max_workers = multiprocessing.cpu_count()
batch_size: int = max(len(data) // max_workers, 1)
Expand All @@ -37,3 +39,18 @@ def parallelize(
for i in store:
result.extend(i[1])
return result


def extreme_clean(data: List[str]) -> List[str]:
    """Run the extreme cleaning pipeline over ``data`` via ``parallelize``.

    The native ``_extreme_clean`` strategy from the ``.texy`` extension is
    used. ``max_workers=0`` makes ``parallelize`` fall back to
    ``multiprocessing.cpu_count()`` workers.
    """
    return parallelize(strategy=_extreme_clean, data=data, max_workers=0)


def strict_clean(data: List[str]) -> List[str]:
    """Run the strict cleaning pipeline over ``data`` via ``parallelize``.

    The native ``_strict_clean`` strategy from the ``.texy`` extension is
    used. ``max_workers=0`` makes ``parallelize`` fall back to
    ``multiprocessing.cpu_count()`` workers.
    """
    return parallelize(strategy=_strict_clean, data=data, max_workers=0)


def relaxed_clean(data: List[str]) -> List[str]:
    """Run the relaxed cleaning pipeline over ``data`` via ``parallelize``.

    The native ``_relaxed_clean`` strategy from the ``.texy`` extension is
    used. ``max_workers=0`` makes ``parallelize`` fall back to
    ``multiprocessing.cpu_count()`` workers.
    """
    return parallelize(strategy=_relaxed_clean, data=data, max_workers=0)
2 changes: 1 addition & 1 deletion typings/texy/pipelines.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ from typing import Callable, List
def parallelize(
strategy: Callable[[List[str]], List[str]], data: List[str], max_workers: int
) -> List[str]:
"""Using this function is not recommended for most of the cases."""
"""Parallelize a pipeline with Python multiprocessing."""
...

def relaxed_clean(data: List[str]) -> List[str]: ...
Expand Down

0 comments on commit 51e44cd

Please sign in to comment.