From fc59a109dd5e43c291f52fdda05aa4e38c5f111d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Jun 2024 11:16:47 +0200 Subject: [PATCH] ci: cleanup (#140) - **update 13 files and delete 9 files** - **update 8 files, copy 1 file and create 2 files** - **test: update test_markdown.py** - **update Dockerfile and main.py** - **update 11 files and delete 3 files** - **update docker_smoketest.yml, .gitignore, Makefile and __main__.py** - **update 5 files** - **chore: update 1 file and delete 1 file** --- .copier-answers.yml | 4 +- .dockerignore | 162 +++++++++++++++++ .env.sample | 8 +- .github/Dockerfile.dev | 7 +- .github/workflows/docker_release.yml | 2 +- .github/workflows/docker_smoketest.yml | 8 +- .gitignore | 7 + Dockerfile | 15 +- Makefile | 19 +- README.md | 48 ++--- compose.sample.yml | 11 ++ lefthook.yml | 6 +- memorymarker/__main__.py | 141 +++++++-------- memorymarker/cli/document_selector.py | 16 -- memorymarker/document_providers/base.py | 4 +- .../document_providers/hydrator/main.py | 107 ----------- .../hydrator/test_hydrator.py | 34 ---- memorymarker/document_providers/omnivore.py | 46 ++++- .../document_providers/omnivore_document.py | 44 ----- .../hydrator => persister}/__init__.py | 0 .../__snapshots__/test_markdown.ambr | 0 .../markdown.py | 0 .../test_markdown.py | 2 +- .../__init__.py | 0 memorymarker/question_generator/chunker.py | 28 +++ .../example_repo.py} | 0 .../question_generator/flows/question_flow.py | 8 +- memorymarker/question_generator/main.py | 168 ------------------ .../question_generator/pipeline_runner.py | 38 ---- .../question_generator/steps/qa_extractor.py | 2 +- .../question_generator/steps/qa_generation.py | 2 +- .../steps/question_wikilinker.py | 2 +- .../question_generator/steps/reasoning.py | 2 +- pyproject.toml | 8 +- pyrightconfig.json | 3 +- pytest.ini | 3 + requirements-dev.lock | 8 +- requirements.lock | 6 +- src/memorymarker/__init__.py | 38 ++++ 39 files changed, 435 insertions(+), 572 deletions(-) 
create mode 100644 .dockerignore create mode 100644 compose.sample.yml delete mode 100644 memorymarker/cli/document_selector.py delete mode 100644 memorymarker/document_providers/hydrator/main.py delete mode 100644 memorymarker/document_providers/hydrator/test_hydrator.py delete mode 100644 memorymarker/document_providers/omnivore_document.py rename memorymarker/{document_providers/hydrator => persister}/__init__.py (100%) rename memorymarker/{persist_questions => persister}/__snapshots__/test_markdown.ambr (100%) rename memorymarker/{persist_questions => persister}/markdown.py (100%) rename memorymarker/{persist_questions => persister}/test_markdown.py (97%) rename memorymarker/{persist_questions => question_generator}/__init__.py (100%) create mode 100644 memorymarker/question_generator/chunker.py rename memorymarker/question_generator/{example_repo_airtable.py => evaluation/example_repo.py} (100%) delete mode 100644 memorymarker/question_generator/main.py delete mode 100644 memorymarker/question_generator/pipeline_runner.py create mode 100644 pytest.ini create mode 100644 src/memorymarker/__init__.py diff --git a/.copier-answers.yml b/.copier-answers.yml index c4c61c5..158af1c 100644 --- a/.copier-answers.yml +++ b/.copier-answers.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier; NEVER EDIT MANUALLY -_commit: ae075e2 +_commit: ac0611c _src_path: https://github.com/MartinBernstorff/nimble-python-template email: martinbernstorff@gmail.com full_name: Martin Bernstorff @@ -7,6 +7,6 @@ github_username: MartinBernstorff package_name: memorymarker project_name: memorymarker project_slug: memorymarker -python_version: '3.12' +python_version: '3.11' release_docker_image: true release_package: true diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7dbf7e8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,162 @@ +# macOS +.DS_Store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so 
+ +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +*.whl + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +.ruff_cache + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# Tests +.testmondata* \ No newline at end of file diff --git a/.env.sample b/.env.sample index dc75ca4..8d17342 100644 --- a/.env.sample +++ b/.env.sample @@ -1,4 +1,4 @@ -OMNIVORE_API_KEY="" -OPENAI_API_KEY="" -ANTHROPIC_API_KEY="" -AIRTABLE_PAT="" \ No newline at end of file +OMNIVORE_API_KEY= +OPENAI_API_KEY= +ANTHROPIC_API_KEY= +MAX_N= \ No newline at end of file diff --git a/.github/Dockerfile.dev b/.github/Dockerfile.dev index 271eca6..bd80cf9 100644 --- a/.github/Dockerfile.dev +++ b/.github/Dockerfile.dev @@ -1,4 +1,4 @@ -FROM python:3.12 +FROM python:3.11 # Set the working directory to /app WORKDIR /app @@ -7,16 +7,15 @@ ENV RYE_HOME="/opt/rye" ENV PATH="$RYE_HOME/shims:$PATH" ENV RYE_INSTALL_OPTION="--yes" ENV RYE_TOOLCHAIN="/usr/local/bin/python" -ENV RYE_VERSION=0.26.0 +ENV RYE_VERSION=0.33.0 -RUN curl -sSf https://rye-up.com/get > /tmp/get-rye.sh +RUN curl -sSf https://rye.astral.sh/get > /tmp/get-rye.sh RUN bash /tmp/get-rye.sh RUN rm /tmp/get-rye.sh RUN echo 'source "$HOME/.rye/env"' >> ~/.bashrc RUN rye config --set-bool behavior.use-uv=true RUN rye config --set-bool behavior.global-python=true -RUN rye config --set default.dependency-operator="~=" COPY Makefile ./ COPY pyproject.toml ./ diff --git 
a/.github/workflows/docker_release.yml b/.github/workflows/docker_release.yml index a49ebaa..81fe292 100644 --- a/.github/workflows/docker_release.yml +++ b/.github/workflows/docker_release.yml @@ -47,4 +47,4 @@ jobs: context: . push: true platforms: linux/amd64,linux/arm64 - tags: ${{steps.meta.outputs.tags }} \ No newline at end of file + tags: ${{ steps.meta.outputs.tags }} \ No newline at end of file diff --git a/.github/workflows/docker_smoketest.yml b/.github/workflows/docker_smoketest.yml index 778ca79..897a470 100644 --- a/.github/workflows/docker_smoketest.yml +++ b/.github/workflows/docker_smoketest.yml @@ -10,7 +10,13 @@ jobs: - name: Checkout (GitHub) uses: actions/checkout@v4 + - name: Get environment variables + run: | + echo "OMNIVORE_API_KEY=${{ secrets.OMNIVORE_API_KEY }}" >> .env + echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env + echo "ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }}" >> .env + - name: Run integration test shell: bash run: | - docker build -t memorymarker -f Dockerfile . 
\ No newline at end of file + make docker-smoketest diff --git a/.gitignore b/.gitignore index 135d907..793da31 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # macOS .DS_Store +# IDEs +.vscode + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -164,3 +167,7 @@ cython_debug/ # Cache omnivore_cache/ profile.html + +*.smoketest* +compose.yml +smoketest_output \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c4bc639..ca12b4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.12 +FROM python:3.11 # Set the working directory to /app WORKDIR /app @@ -7,16 +7,19 @@ ENV RYE_HOME="/opt/rye" ENV PATH="$RYE_HOME/shims:$PATH" ENV RYE_INSTALL_OPTION="--yes" ENV RYE_TOOLCHAIN="/usr/local/bin/python" -ENV RYE_VERSION=0.26.0 +ENV RYE_VERSION=0.33.0 -RUN curl -sSf https://rye-up.com/get > /tmp/get-rye.sh +RUN curl -sSf https://rye.astral.sh/get > /tmp/get-rye.sh RUN bash /tmp/get-rye.sh RUN rm /tmp/get-rye.sh RUN echo 'source "$HOME/.rye/env"' >> ~/.bashrc RUN rye config --set-bool behavior.use-uv=true RUN rye config --set-bool behavior.global-python=true -RUN rye config --set default.dependency-operator="~=" -COPY . /app -RUN make quicksync +COPY pyproject.toml requirements.lock requirements-dev.lock ./ +RUN rye sync --no-lock + +COPY . /app/ +RUN rye sync --no-lock +ENTRYPOINT ["python", "-m", "memorymarker"] \ No newline at end of file diff --git a/Makefile b/Makefile index d487867..962c5a2 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ quicksync: rye sync --no-lock test: - @rye run pytest --cov=$(SRC_PATH) $(SRC_PATH) --cov-report xml:.coverage.xml --cov-report lcov:.coverage.lcov --testmon + rye test test-with-coverage: @echo "––– Testing –––" @@ -24,9 +24,7 @@ test-with-coverage: lint: ## Format code @echo "––– Linting –––" @rye run ruff format . - @rye run ruff . --fix --unsafe-fixes \ - --extend-select F401 \ - --extend-select F841 + @rye run ruff . 
--fix --unsafe-fixes @echo "✅✅✅ Lint ✅✅✅" types: ## Type-check code @@ -46,12 +44,19 @@ docker_ci: ## Run all checks in docker docker build -t memorymarker_ci -f .github/Dockerfile.dev . docker run --env-file .env memorymarker_ci make validate_ci -pr: ## Submit a PR - @lumberman sync --squash --automerge - ######################### # End template makefile # ######################### +docker-smoketest: + cp compose.sample.yml compose.smoketest.yml + perl -pi -e 's#YOUR_OUTPUT_DIR#./smoketest_output#' compose.smoketest.yml + + cp .env .env.smoketest + echo "MAX_N=1" >> .env.smoketest + + docker build . -t ghcr.io/martinbernstorff/memorymarker:latest + docker compose -f compose.smoketest.yml --env-file .env.smoketest up + update-snapshots: @rye run pytest --snapshot-update diff --git a/README.md b/README.md index 512039a..157a1b1 100644 --- a/README.md +++ b/README.md @@ -2,61 +2,49 @@ # memorymarker -[![PyPI](https://img.shields.io/pypi/v/memorymarker.svg)][pypi status] -[![Python Version](https://img.shields.io/pypi/pyversions/memorymarker)][pypi status] -[![documentation](https://github.com/martinbernstorff/memorymarker/actions/workflows/documentation.yml/badge.svg)][documentation] -[![Tests](https://github.com/martinbernstorff/memorymarker/actions/workflows/tests.yml/badge.svg)][tests] -[![Black](https://img.shields.io/badge/code%20style-black-000000.svg)][black] - -[pypi status]: https://pypi.org/project/memorymarker/ -[documentation]: https://martinbernstorff.github.io/memorymarker/ -[tests]: https://github.com/martinbernstorff/memorymarker/actions?workflow=Tests -[black]: https://github.com/psf/black + +Highlighting does not aid memory. Questions do. But they take time. MemoryMarker turns your highlights into questions, so you can maintain traction at speed. - +Specifically, it takes highlights from [Omnivore](https://www.omnivore.app/) and turns them into markdown questions. 
-TODO: Figure out github actions and add description +To supercharge this, you can even ingest these questions into [Anki](https://apps.ankiweb.net/) using [Memium](https://github.com/MartinBernstorff/Memium). -## Installation +## Setup -You can install `memorymarker` via [pip] from [PyPI]: - -```bash -pip install memorymarker -``` +A Docker image for Omnivore is continuously built and pushed to [ghcr.io/martinbernstorff/memorymarker](https://github.com/martinbernstorff/memorymarker/pkgs/container/memorymarker). -[pip]: https://pip.pypa.io/en/stable/installing/ -[PyPI]: https://pypi.org/project/memorymarker/ +1. Install [Docker](https://docs.docker.com/get-docker/) or [Orbstack](https://orbstack.dev/) -## Usage +2. Update the api keys in the `.env` file -TODO: Add minimal usage example +3. Run the container: -To see more examples, see the [documentation]. +```bash +docker compose up +``` # 📖 Documentation -| Documentation | | -| --------------------- | -------------------------------------------------------- | +| Documentation | | +| ---------------------- | -------------------------------------------------------- | | 🔧 **[Installation]** | Installation instructions on how to install this package | | 📖 **[Documentation]** | A minimal and developing documentation | | 👩‍💻 **[Tutorials]** | Tutorials for using this package | | 🎛️ **[API Reference]** | API reference for this package | | 📚 **[FAQ]** | Frequently asked questions | - # 💬 Where to ask questions -| Type | | -| ------------------------------ | ---------------------- | +| Type | | +| ------------------------------- | ---------------------- | | 📚 **FAQ** | [FAQ] | | 🚨 **Bug Reports** | [GitHub Issue Tracker] | | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] | | 👩‍💻 **Usage Questions** | [GitHub Discussions] | -| 🗯 **General Discussion** | [GitHub Discussions] | +| 🗯 **General Discussion** | [GitHub Discussions] | [Documentation]: https://martinbernstorff.github.io/memorymarker/index.html 
[Installation]: https://martinbernstorff.github.io/memorymarker/installation.html @@ -65,5 +53,3 @@ To see more examples, see the [documentation]. [FAQ]: https://martinbernstorff.github.io/memorymarker/faq.html [github issue tracker]: https://github.com/martinbernstorff/memorymarker/issues [github discussions]: https://github.com/martinbernstorff/memorymarker/discussions - - diff --git a/compose.sample.yml b/compose.sample.yml new file mode 100644 index 0000000..e507ac2 --- /dev/null +++ b/compose.sample.yml @@ -0,0 +1,11 @@ +services: + memorymarker: + image: ghcr.io/martinbernstorff/memorymarker:latest + container_name: memorymarker + volumes: + - YOUR_OUTPUT_DIR:/output + environment: + - OMNIVORE_API_KEY=${OMNIVORE_API_KEY} + - OPENAI_API_KEY=${OPENAI_API_KEY} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - MAX_N=${MAX_N} diff --git a/lefthook.yml b/lefthook.yml index 902d225..414fcd3 100644 --- a/lefthook.yml +++ b/lefthook.yml @@ -3,13 +3,9 @@ pre-commit: commands: - format: - glob: "*.{py}" - run: rye run ruff format {staged_files} - stage_fixed: true lint: glob: "*.{py}" - run: rye run ruff --fix --extend-select F401 --extend-select F841 --extend-select B007 {staged_files} + run: make lint stage_fixed: true post-checkout: diff --git a/memorymarker/__main__.py b/memorymarker/__main__.py index 5148bfc..f1cbe02 100644 --- a/memorymarker/__main__.py +++ b/memorymarker/__main__.py @@ -2,10 +2,8 @@ import datetime as dt import logging import os -import time from dataclasses import dataclass from pathlib import Path -from typing import Callable import coloredlogs import pytz @@ -13,9 +11,9 @@ from dotenv import load_dotenv from iterpy.iter import Iter -from memorymarker.cli.document_selector import select_documents from memorymarker.document_providers.omnivore import Omnivore -from memorymarker.persist_questions.markdown import highlight_group_to_file +from memorymarker.persister.markdown import highlight_group_to_file +from memorymarker.question_generator.chunker 
import chunk_highlights from memorymarker.question_generator.completers.anthropic_completer import ( AnthropicCompleter, ) @@ -24,14 +22,11 @@ OpenAIModelCompleter, ) from memorymarker.question_generator.flows.question_flow import QuestionFlow -from memorymarker.question_generator.main import chunk_highlights from memorymarker.question_generator.qa_responses import QAResponses -from memorymarker.question_generator.steps.qa_extractor import QuestionExtractionStep -from memorymarker.question_generator.steps.qa_generation import QuestionGenerationStep -from memorymarker.question_generator.steps.question_wikilinker import ( - QuestionWikilinkerStep, -) -from memorymarker.question_generator.steps.reasoning import ReasoningStep +from memorymarker.question_generator.steps.qa_extractor import QuestionExtractor +from memorymarker.question_generator.steps.qa_generation import QuestionGenerator +from memorymarker.question_generator.steps.question_wikilinker import QuestionWikilinker +from memorymarker.question_generator.steps.reasoning import Reasoning app = typer.Typer(no_args_is_help=True) @@ -43,7 +38,9 @@ def get_api_key_from_env(env_var: str) -> str | None: @dataclass(frozen=True) -class TimestampHandler: +class TimestampRepository: + """Writes and gets a timestamp for syncing.""" + filepath: Path def update_timestamp(self) -> None: @@ -52,54 +49,61 @@ def update_timestamp(self) -> None: self.filepath.write_text(dt.datetime.now(pytz.UTC).isoformat()) - def get_timestamp(self) -> dt.datetime | None: + def get_timestamp(self) -> dt.datetime: + """Returns the last run timestamp or a value far in the past if it doesn't exist.""" try: return dt.datetime.fromisoformat(self.filepath.read_text()) except FileNotFoundError: - return None - - -def sleep_and_run(sleep_time: int, run_func: Callable[[], None]) -> None: - time.sleep(sleep_time) - run_func() + logging.info( + "No last run timestamp found, generating questions for all highlights" + ) + return dt.datetime(1970, 1, 1, 
tzinfo=pytz.UTC) @app.command() # type: ignore def typer_cli( - omnivore_api_key: str = typer.Option( - None, help="Omnivore API key", envvar="OMNIVORE_API_KEY" + omnivore_api_key: str = typer.Argument( + help="Omnivore API key", envvar="OMNIVORE_API_KEY" ), - openai_api_key: str = typer.Option( - None, help="Anthropic API key", envvar="OPENAI_API_KEY" + openai_api_key: str = typer.Argument( + help="OpenAI API key", envvar="OPENAI_API_KEY" ), - anthropic_api_key: str = typer.Option( - None, help="Anthropic API key", envvar="ANTHROPIC_API_KEY" + anthropic_api_key: str = typer.Argument( + help="Anthropic API key", envvar="ANTHROPIC_API_KEY" ), output_dir: Path = typer.Argument( # noqa: B008 # type: ignore - Path("questions"), + Path("/output"), help="Directory to save the generated questions to", file_okay=False, dir_okay=True, writable=True, ), - run_every: int = typer.Option( - None, help="How often to run the script in seconds", envvar="RUN_EVERY" - ), max_n: int = typer.Argument( - 1, help="Maximum number of questions to generate from highlights" + help="Maximum number of questions in total", envvar="MAX_N" ), only_new: bool = typer.Option( True, help="Only generate questions from highlights since last run" ), - select: bool = typer.Option( - False, help="Prompt to select which documents to generate questions from" + log_level: str = typer.Option( + "INFO", + help="Log level", + case_sensitive=False, + show_default=True, + envvar="LOG_LEVEL", ), ) -> None: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + datefmt="%Y/&m/%d %H:%M:%S", + filename="main.log", + ) + coloredlogs.install(level=log_level) # type: ignore + output_dir.mkdir(exist_ok=True, parents=True) - last_run_timestamper = TimestampHandler(output_dir / ".memorymarker") - last_run_timestamp = last_run_timestamper.get_timestamp() logging.info(f"MemoryMarker version {version('memorymarker')}") + logging.info("Fetching documents") documents = ( 
Omnivore(omnivore_api_key) @@ -107,54 +111,51 @@ def typer_cli( .filter(lambda _: len(_.highlights) > 0) ) - if select: - documents = select_documents(documents) - - logging.info("Processing to highlights") + # Extract highlights from documents highlights = documents.map(lambda _: _.get_highlights()).flatten() + last_run_timestamper = TimestampRepository(output_dir / ".memorymarker") if only_new: - if not last_run_timestamp: - logging.info( - "No last run timestamp found, generating questions for all highlights" - ) - last_run_timestamp = dt.datetime(1970, 1, 1, tzinfo=pytz.UTC) - + last_run_timestamp = last_run_timestamper.get_timestamp() logging.info( f"Last run at UTC {last_run_timestamp.strftime('%Y-%m-%d %H:%M:%S')}" ) highlights = highlights.filter(lambda _: _.updated_at > last_run_timestamp) - if highlights.count() == 0: - logging.info("No new highlights since last run") - if not run_every: - return - + if highlights.count() == 0: + logging.info("No new highlights since last run") + return logging.info(f"Received {highlights.count()} new highlights") + logging.info( + f"max_n is set to {max_n}, so processing {min(max_n, highlights.count())} highlights" + ) + # Chunk highlights for better reasoning and fewer duplicate questions logging.info("Generating questions from highlights...") - base_completer = AnthropicCompleter( - api_key=anthropic_api_key, model="claude-3-opus-20240229" - ) chunked_highlights = ( highlights.groupby(lambda _: _.source_document.title) .map(lambda _: chunk_highlights(_, 5)) .flatten() ) + + # Generate questions + base_completer = AnthropicCompleter( + api_key=anthropic_api_key, model="claude-3-opus-20240229" + ) questions = asyncio.run( QuestionFlow( - _name="simplified_reasoning", + name="simplified_reasoning", steps=( - ReasoningStep(completer=base_completer), - QuestionGenerationStep(completer=base_completer, n_questions=(1, 5)), - QuestionExtractionStep( + Reasoning(completer=base_completer), + 
QuestionGenerator(completer=base_completer, n_questions=(1, 5)), + QuestionExtractor( completer=OpenAIModelCompleter( api_key=openai_api_key, model="gpt-3.5-turbo", response_model=QAResponses, # type: ignore ) ), - QuestionWikilinkerStep( + QuestionWikilinker( completer=OpenAICompleter( api_key=os.getenv("OPENAI_API_KEY", "No OPENAI_API"), model="gpt-4-turbo-preview", @@ -164,36 +165,14 @@ def typer_cli( )(chunked_highlights[0:max_n]) ) + # Write to disk logging.info("Writing questions to markdown...") - - highlight_groups = Iter(questions[0:max_n]).groupby( - lambda _: _.source_document.title - ) - for group in highlight_groups: + for group in questions.groupby(lambda _: _.source_document.title): highlight_group_to_file(output_dir, group) last_run_timestamper.update_timestamp() - if run_every: - logging.info(f"Running every {run_every} seconds") - time.sleep(run_every) - logging.info("Running again") - typer_cli( - omnivore_api_key=omnivore_api_key, - output_dir=output_dir, - run_every=run_every, - max_n=max_n, - only_new=only_new, - select=select, - ) if __name__ == "__main__": load_dotenv() - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", - datefmt="%Y/&m/%d %H:%M:%S", - filename="main.log", - ) - coloredlogs.install(level="DEBUG") # type: ignore app() diff --git a/memorymarker/cli/document_selector.py b/memorymarker/cli/document_selector.py deleted file mode 100644 index 08c3e19..0000000 --- a/memorymarker/cli/document_selector.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import TYPE_CHECKING - -import questionary - -if TYPE_CHECKING: - from iterpy.iter import Iter - - from ..document_providers.omnivore_document import OmnivoreDocument - - -def select_documents(docs: "Iter[OmnivoreDocument]") -> "Iter[OmnivoreDocument]": - doc_titles = docs.map(lambda d: d.title).to_list() - selected_doc_names = questionary.checkbox( - message="Select documents", choices=doc_titles - ).ask() - return docs.filter(lambda 
d: d.title in selected_doc_names) diff --git a/memorymarker/document_providers/base.py b/memorymarker/document_providers/base.py index 06a7a68..163e929 100644 --- a/memorymarker/document_providers/base.py +++ b/memorymarker/document_providers/base.py @@ -9,11 +9,13 @@ from memorymarker.question_generator.reasoned_highlight import Highlights - from .omnivore_document import OmnivoreDocument + from .omnivore import OmnivoreDocument @dataclass(frozen=True) class OrphanHighlight: + """Highlight without a source document""" + highlight: str uri: str title: str diff --git a/memorymarker/document_providers/hydrator/main.py b/memorymarker/document_providers/hydrator/main.py deleted file mode 100644 index 2109abc..0000000 --- a/memorymarker/document_providers/hydrator/main.py +++ /dev/null @@ -1,107 +0,0 @@ -import logging -import re -from typing import TYPE_CHECKING, Callable, Sequence -from urllib.request import urlopen - -import requests -from bs4 import BeautifulSoup, NavigableString, Tag -from joblib import Memory - -from memorymarker.question_generator.reasoned_highlight import Highlights - -if TYPE_CHECKING: - from memorymarker.document_providers.base import OrphanHighlight - -memory = Memory(".soup_download_cache", verbose=0) - - -@memory.cache() # type: ignore -def download_soup_from_url(url: str) -> BeautifulSoup: - # Send HTTP request to URL and save the response from server in a response object called r - r = requests.get(url) - - # Create a BeautifulSoup object and specify the parser - soup = BeautifulSoup(r.text, "html.parser") - return soup - - -class ContextParser: - @staticmethod - def get_highlight_context( - soup: BeautifulSoup, - highlight: str, - n_chars_before: int = 100, - n_chars_after: int = 100, - ) -> str: - highlight_selection = soup.find(text=re.compile(highlight)) - - if highlight_selection is None: - logging.info(f"Could not find highlight {highlight} in {soup.title}") - return "" - - highlight_container: Tag = 
highlight_selection.parent.parent # type: ignore - - context_strings: list[str] = [] - - for child in highlight_container.descendants: - if isinstance(child, NavigableString): - context_strings.append(str(child)) - - context = " ".join(context_strings) - - context = ContextParser._select_context_slice( - highlight=highlight, - n_chars_before=n_chars_before, - n_chars_after=n_chars_after, - context=context, - ) - return context - - @staticmethod - def _select_context_slice( - highlight: str, n_chars_before: int, n_chars_after: int, context: str - ) -> str: - highlight_index = context.find(highlight) - context_start_index = max(0, highlight_index - n_chars_before) - context_end_index = min( - len(context), highlight_index + len(highlight) + n_chars_after - ) - - return context[context_start_index:context_end_index] - - -class HighlightHydrator: - def __init__(self, soup_downloader: Callable[[str], BeautifulSoup]) -> None: - self.soup_downloader = soup_downloader - - def hydrate_highlights( - self, highlights: Sequence["OrphanHighlight"] - ) -> Sequence[Highlights | None]: - hydrated_highlights: list[Highlights | None] = [] - for highlight in highlights: - try: - page = urlopen(highlight.uri) - except Exception: - logging.info(f"Could not open {highlight.uri}") - hydrated_highlights.append(None) - continue - - soup = self.soup_downloader(page) - context = ContextParser.get_highlight_context( - soup=soup, highlight=highlight.highlight - ) - hydrated_highlights.append( - Highlights( - highlighted_text=highlight.highlight, - prefix=context[:100], - suffix=context[-100:], - ) # type: ignore - ) - - return hydrated_highlights - - -if __name__ == "__main__": - result = download_soup_from_url( - "https://www.gutenberg.org/files/2701/2701-h/2701-h.htm" - ) diff --git a/memorymarker/document_providers/hydrator/test_hydrator.py b/memorymarker/document_providers/hydrator/test_hydrator.py deleted file mode 100644 index bb35592..0000000 --- 
a/memorymarker/document_providers/hydrator/test_hydrator.py +++ /dev/null @@ -1,34 +0,0 @@ -from bs4 import BeautifulSoup - -from memorymarker.document_providers.hydrator.main import ContextParser - - -def test_context_parser(): - input_soup = BeautifulSoup( - """ - - -

Some text

-

Some more text

-

Even more text

- - - """, - "html.parser", - ) - - expected_output = "\n Some text \n Some more text \n Even more text \n" - context = ContextParser.get_highlight_context(soup=input_soup, highlight="more") - assert context == expected_output - - -def test_context_slicing(): - highlight = "highlight" - context = "54321highlight12345" - - assert ( - ContextParser._select_context_slice( # type: ignore - highlight=highlight, n_chars_before=1, n_chars_after=1, context=context - ) - == "1highlight1" - ) diff --git a/memorymarker/document_providers/omnivore.py b/memorymarker/document_providers/omnivore.py index 91df260..775d437 100644 --- a/memorymarker/document_providers/omnivore.py +++ b/memorymarker/document_providers/omnivore.py @@ -1,14 +1,54 @@ +import os from dataclasses import dataclass -from typing import Mapping +from typing import Any, Mapping, Sequence from iterpy.iter import Iter from omnivoreql import OmnivoreQL +from pydantic import BaseModel -from memorymarker.document_providers.omnivore_document import OmnivoreDocument +from memorymarker.question_generator.reasoned_highlight import ( + Highlights, + SourceDocument, +) from .base import DocumentProvider +def _empty_string_if_none(value: str | None) -> str: + return value or "" + + +class OmnivoreDocument(BaseModel): + title: str + uri: str + slug: str + highlights: Sequence[Mapping[str, Any]] + + def _parse_highlight(self, highlight: Mapping[str, str]) -> Highlights | None: + if "quote" not in highlight or highlight["quote"] is None: # type: ignore + return None + + return Highlights( + source_document=SourceDocument( + title=self.title, + uri=f"https://omnivore.app/me/{self.slug}#{highlight['id']}", + ), + pipeline_name="", + reasoning_prompt="", + reasoning="", + qa_string="", + question_answer_pairs=[], + highlighted_text=highlight["quote"], + prefix=_empty_string_if_none(highlight["prefix"]), + suffix=_empty_string_if_none(highlight["suffix"]), + updated_at=highlight["updatedAt"], # type: ignore # Will be recast on 
init. + ) + + def get_highlights(self) -> Iter[Highlights]: + highlights = Iter(self.highlights).map(self._parse_highlight) + return highlights.filter(lambda _: _ is not None) # type: ignore + + @dataclass class Omnivore(DocumentProvider): api_key: str @@ -26,7 +66,7 @@ def _parse_doc(self, document: Mapping[str, str]) -> OmnivoreDocument: def get_documents(self) -> Iter[OmnivoreDocument]: documents = ( - Iter(self.client.get_articles(limit=1000)["search"]["edges"]) + Iter(self.client.get_articles(limit=100)["search"]["edges"]) .map(lambda a: a["node"]) .map(self._parse_doc) .flatten() diff --git a/memorymarker/document_providers/omnivore_document.py b/memorymarker/document_providers/omnivore_document.py deleted file mode 100644 index 7646040..0000000 --- a/memorymarker/document_providers/omnivore_document.py +++ /dev/null @@ -1,44 +0,0 @@ -from typing import Any, Mapping, Sequence - -from iterpy.iter import Iter -from pydantic import BaseModel - -from memorymarker.question_generator.reasoned_highlight import ( - Highlights, - SourceDocument, -) - - -def empty_string_if_none(value: str | None) -> str: - return value or "" - - -class OmnivoreDocument(BaseModel): - title: str - uri: str - slug: str - highlights: Sequence[Mapping[str, Any]] - - def _parse_highlight(self, highlight: Mapping[str, str]) -> Highlights | None: - if "quote" not in highlight or highlight["quote"] is None: # type: ignore - return None - - return Highlights( - source_document=SourceDocument( - title=self.title, - uri=f"https://omnivore.app/me/{self.slug}#{highlight["id"]}", - ), - pipeline_name="", - reasoning_prompt="", - reasoning="", - qa_string="", - question_answer_pairs=[], - highlighted_text=highlight["quote"], - prefix=empty_string_if_none(highlight["prefix"]), - suffix=empty_string_if_none(highlight["suffix"]), - updated_at=highlight["updatedAt"], # type: ignore # Will be recast on init. 
- ) - - def get_highlights(self) -> Iter[Highlights]: - highlights = Iter(self.highlights).map(self._parse_highlight) - return highlights.filter(lambda _: _ is not None) # type: ignore diff --git a/memorymarker/document_providers/hydrator/__init__.py b/memorymarker/persister/__init__.py similarity index 100% rename from memorymarker/document_providers/hydrator/__init__.py rename to memorymarker/persister/__init__.py diff --git a/memorymarker/persist_questions/__snapshots__/test_markdown.ambr b/memorymarker/persister/__snapshots__/test_markdown.ambr similarity index 100% rename from memorymarker/persist_questions/__snapshots__/test_markdown.ambr rename to memorymarker/persister/__snapshots__/test_markdown.ambr diff --git a/memorymarker/persist_questions/markdown.py b/memorymarker/persister/markdown.py similarity index 100% rename from memorymarker/persist_questions/markdown.py rename to memorymarker/persister/markdown.py diff --git a/memorymarker/persist_questions/test_markdown.py b/memorymarker/persister/test_markdown.py similarity index 97% rename from memorymarker/persist_questions/test_markdown.py rename to memorymarker/persister/test_markdown.py index 5a9c6f3..46c7dee 100644 --- a/memorymarker/persist_questions/test_markdown.py +++ b/memorymarker/persister/test_markdown.py @@ -3,7 +3,7 @@ import pytest -import memorymarker.persist_questions.markdown as markdown +import memorymarker.persister.markdown as markdown from memorymarker.question_generator.qa_responses import QAPrompt from memorymarker.question_generator.reasoned_highlight import ( Highlights, diff --git a/memorymarker/persist_questions/__init__.py b/memorymarker/question_generator/__init__.py similarity index 100% rename from memorymarker/persist_questions/__init__.py rename to memorymarker/question_generator/__init__.py diff --git a/memorymarker/question_generator/chunker.py b/memorymarker/question_generator/chunker.py new file mode 100644 index 0000000..42a8ccf --- /dev/null +++ 
b/memorymarker/question_generator/chunker.py @@ -0,0 +1,28 @@ +from typing import TYPE_CHECKING, Sequence + +from joblib import Memory + +if TYPE_CHECKING: + from memorymarker.question_generator.reasoned_highlight import Highlights + +omnivore_cache = Memory(".cache/omnivore") + + +def chunk_highlights( + group: tuple[str, Sequence["Highlights"]], chunk_size: int +) -> Sequence["Highlights"]: + groups: Sequence["Highlights"] = [] + + for i in range(0, len(group[1]), 5): + subset: Sequence["Highlights"] = group[1][i : i + chunk_size] + combined_text = "\n---\n".join( + f"> {_.prefix}{_.highlighted_text}{_.suffix}" + for _ in subset + ) + new_highlight = subset[-1] + new_highlight.highlighted_text = combined_text + new_highlight.prefix = "" + new_highlight.suffix = "" + groups.append(new_highlight) + + return groups diff --git a/memorymarker/question_generator/example_repo_airtable.py b/memorymarker/question_generator/evaluation/example_repo.py similarity index 100% rename from memorymarker/question_generator/example_repo_airtable.py rename to memorymarker/question_generator/evaluation/example_repo.py diff --git a/memorymarker/question_generator/flows/question_flow.py b/memorymarker/question_generator/flows/question_flow.py index f72d44f..4b5da4d 100644 --- a/memorymarker/question_generator/flows/question_flow.py +++ b/memorymarker/question_generator/flows/question_flow.py @@ -13,7 +13,7 @@ @dataclass(frozen=True) class QuestionFlow: - _name: str + name: str steps: tuple["FlowStep"] async def _process_item(self, highlight: "Highlights") -> "Highlights": @@ -21,7 +21,7 @@ async def _process_item(self, highlight: "Highlights") -> "Highlights": async with sem: for step in self.steps: result = await step(highlight) - result.pipeline_name = self.name + result.pipeline_name = self.identity return result async def __call__(self, highlights: Iter["Highlights"]) -> Iter["Highlights"]: @@ -32,6 +32,6 @@ async def __call__(self, highlights: Iter["Highlights"]) -> 
Iter["Highlights"]: return Iter(results) @property - def name(self) -> str: + def identity(self) -> str: step_identites = "_".join(step.identity() for step in self.steps) - return f"{self._name}_{step_identites}" + return f"{self.name}_{step_identites}" diff --git a/memorymarker/question_generator/main.py b/memorymarker/question_generator/main.py deleted file mode 100644 index 10e5725..0000000 --- a/memorymarker/question_generator/main.py +++ /dev/null @@ -1,168 +0,0 @@ -import asyncio -import logging -import os -from dataclasses import dataclass -from typing import TYPE_CHECKING, Sequence - -import coloredlogs -from iterpy.iter import Iter -from joblib import Memory - -from memorymarker.document_providers.omnivore import Omnivore -from memorymarker.question_generator.completers.anthropic_completer import ( - AnthropicCompleter, -) -from memorymarker.question_generator.completers.openai_completer import ( - OpenAICompleter, - OpenAIModelCompleter, -) -from memorymarker.question_generator.example_repo_airtable import ( - AirtableExampleRepo, - PipelineHighlightIdentity, - update_repository, -) -from memorymarker.question_generator.flows.question_flow import QuestionFlow -from memorymarker.question_generator.pipeline_runner import run_pipelines -from memorymarker.question_generator.qa_responses import QAResponses -from memorymarker.question_generator.steps.qa_extractor import QuestionExtractionStep -from memorymarker.question_generator.steps.qa_generation import QuestionGenerationStep -from memorymarker.question_generator.steps.question_wikilinker import ( - QuestionWikilinkerStep, -) -from memorymarker.question_generator.steps.reasoning import ReasoningStep - -if TYPE_CHECKING: - from memorymarker.question_generator.reasoned_highlight import Highlights - -omnivore_cache = Memory(".cache/omnivore") - - -@dataclass(frozen=True) -class HighlightWithPipeline(PipelineHighlightIdentity): - highlight: "Highlights" - pipeline: QuestionFlow - - def identity(self) -> int: - 
return self.pipeline_highlight_id( - self.pipeline.name, self.highlight.highlighted_text - ) - - -def _generate_highlight_pipeline_pairs( - selected_highlights: Iter["Highlights"], pipelines: Sequence[QuestionFlow] -) -> Iter[HighlightWithPipeline]: - return Iter( - [ - HighlightWithPipeline(highlight=highlight, pipeline=pipeline) - for pipeline in pipelines - for highlight in selected_highlights.to_list() - ] - ) - - -@omnivore_cache.cache() # type: ignore -def _select_highlights_from_omnivore() -> Iter["Highlights"]: - highlights = ( - Omnivore( - api_key=os.getenv("OMNIVORE_API_KEY", "No OMNIVORE_API_KEY in environment") - ) - .get_documents() - .map(lambda _: _.get_highlights().to_list()) - .flatten() - ) - - return highlights - - -def chunk_highlights( - group: tuple[str, Sequence["Highlights"]], chunk_size: int -) -> Sequence["Highlights"]: - groups: Sequence["Highlights"] = [] - - for i in range(0, len(group[1]), 5): - subset: Sequence["Highlights"] = group[1][i : i + chunk_size] - combined_text = "\n---\n".join( - f"> {_.prefix}{_.highlighted_text}{_.suffix}" - for _ in subset - ) - new_highlight = subset[-1] - new_highlight.highlighted_text = combined_text - new_highlight.prefix = "" - new_highlight.suffix = "" - groups.append(new_highlight) - - return groups - - -async def main(): - repository = AirtableExampleRepo() - # content_filter = { - # "drenge og mænd ikke har nogen værdi", - # "The quality of a model", - # "Dependency injection is not effective if", - # "The essence of writing code then is to internalize the problem domain", - # "stack is a data structure that contains a collection of elements where you can add and delete elements from just one end ", - # "A semaphore manages an internal counter", - # } - document_titles = {"Singly Linked List"} - input_highlights = _select_highlights_from_omnivore() - selected_highlights = input_highlights.filter( - lambda _: any(title in _.source_document.title for title in document_titles) - ) - - 
grouped_highlights = ( - selected_highlights.groupby(lambda _: _.source_document.title) - .map(lambda group: chunk_highlights(group=group, chunk_size=5)) - .flatten() - ) - - old_example_hashes = ( - Iter(repository.get_existing_examples()).map(lambda _: _.__hash__()).to_list() - ) - - base_completer = AnthropicCompleter( - api_key=os.getenv("ANTHROPIC_API_KEY", None), model="claude-3-opus-20240229" - ) - # base_completer = OpenAICompleter( - # api_key=os.getenv("OPENAI_API_KEY", None), model="gpt-4-turbo-preview" - # ) - new_highlights = _generate_highlight_pipeline_pairs( - grouped_highlights, - [ - QuestionFlow( - _name="chunked_reasoning_with_wikilinks", - steps=( - ReasoningStep(completer=base_completer), - QuestionGenerationStep( - completer=base_completer, n_questions=(1, 5) - ), - QuestionExtractionStep( - completer=OpenAIModelCompleter( - api_key=os.getenv("OPENAI_API_KEY", "No OPENAI_API"), - model="gpt-3.5-turbo", - response_model=QAResponses, # type: ignore - ) - ), - QuestionWikilinkerStep( - completer=OpenAICompleter( - api_key=os.getenv("OPENAI_API_KEY", "No OPENAI_API"), - model="gpt-4-turbo-preview", - ) - ), - ), - ) - ], - ).filter(lambda pair: pair.identity() not in old_example_hashes) - - new_responses = await run_pipelines(new_highlights) - update_repository(new_responses, repository=repository) - - -if __name__ == "__main__": - coloredlogs.install( # type: ignore - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", - datefmt="%Y/%m/%d %H:%M:%S", - file_name="tester.log", - ) - asyncio.run(main()) diff --git a/memorymarker/question_generator/pipeline_runner.py b/memorymarker/question_generator/pipeline_runner.py deleted file mode 100644 index 7d1d274..0000000 --- a/memorymarker/question_generator/pipeline_runner.py +++ /dev/null @@ -1,38 +0,0 @@ -import asyncio -from typing import TYPE_CHECKING, Mapping, Sequence - -from iterpy.iter import Iter - -if TYPE_CHECKING: - from 
memorymarker.question_generator.flows.question_flow import QuestionFlow - from memorymarker.question_generator.main import HighlightWithPipeline - from memorymarker.question_generator.reasoned_highlight import Highlights - - -async def run_pipeline( - pipeline_name: str, - pipelinename2pipeline: Mapping[str, "QuestionFlow"], - highlights: Sequence["Highlights"], -) -> Iter["Highlights"]: - pipeline = pipelinename2pipeline[pipeline_name] - prompts = await pipeline(Iter(highlights)) - return prompts - - -async def run_pipelines(pairs: Iter["HighlightWithPipeline"]) -> Iter["Highlights"]: - pipelinename2pipeline = {pair.pipeline.name: pair.pipeline for pair in pairs} - pipelines_with_highlights = pairs.groupby(lambda _: _.pipeline.name) - - examples = await asyncio.gather( - *[ - run_pipeline( - pipeline_name=pipeline_name, - pipelinename2pipeline=pipelinename2pipeline, - highlights=[pair.highlight], - ) - for pipeline_name, pairs_instance in pipelines_with_highlights - for pair in pairs_instance - ] - ) - - return Iter(examples).flatten() diff --git a/memorymarker/question_generator/steps/qa_extractor.py b/memorymarker/question_generator/steps/qa_extractor.py index e1e208e..19288d2 100644 --- a/memorymarker/question_generator/steps/qa_extractor.py +++ b/memorymarker/question_generator/steps/qa_extractor.py @@ -12,7 +12,7 @@ @dataclass(frozen=True) -class QuestionExtractionStep(FlowStep): +class QuestionExtractor(FlowStep): completer: "ModelCompleter" def identity(self) -> str: diff --git a/memorymarker/question_generator/steps/qa_generation.py b/memorymarker/question_generator/steps/qa_generation.py index bc6d2fa..f7bc56c 100644 --- a/memorymarker/question_generator/steps/qa_generation.py +++ b/memorymarker/question_generator/steps/qa_generation.py @@ -9,7 +9,7 @@ @dataclass(frozen=True) -class QuestionGenerationStep(FlowStep): +class QuestionGenerator(FlowStep): completer: "Completer" n_questions: tuple[int, int] prompt = """You are generating interesting questions. 
The questions should: diff --git a/memorymarker/question_generator/steps/question_wikilinker.py b/memorymarker/question_generator/steps/question_wikilinker.py index d53bb95..9a91898 100644 --- a/memorymarker/question_generator/steps/question_wikilinker.py +++ b/memorymarker/question_generator/steps/question_wikilinker.py @@ -11,7 +11,7 @@ @dataclass(frozen=True) -class QuestionWikilinkerStep(FlowStep): +class QuestionWikilinker(FlowStep): completer: "Completer" prompt = """In the following, identify the important, domain-specific terms. Then, capitalise them, and surround them with wikilinks. There can be more than one important term. Identify terms as you would in a wikipedia article. diff --git a/memorymarker/question_generator/steps/reasoning.py b/memorymarker/question_generator/steps/reasoning.py index d43353d..23192b2 100644 --- a/memorymarker/question_generator/steps/reasoning.py +++ b/memorymarker/question_generator/steps/reasoning.py @@ -9,7 +9,7 @@ @dataclass(frozen=True) -class ReasoningStep(FlowStep): +class Reasoning(FlowStep): completer: "Completer" prompt = """Document title: {document_title} diff --git a/pyproject.toml b/pyproject.toml index 9e9f488..6944c6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] version = "0.22.1" -requires-python = ">=3.12" +requires-python = ">=3.11" name = "memorymarker" description = "memorymarker" dependencies = [ @@ -8,7 +8,7 @@ dependencies = [ "instructor>=0.6.2", "iterpy>=1.6.0", "joblib>=1.3.2", - "omnivoreql>=0.2.1", + "omnivoreql>=0.3.3", "openai>=1.13.3", "pydantic>=2.6.2", "python-dotenv>=1.0.1", @@ -19,7 +19,7 @@ dependencies = [ "anthropic>=0.21.3", ] authors = [{ name = "Martin Bernstorff", email = "martinbernstorff@gmail.com" }] -classifiers = ["Programming Language :: Python :: 3.12"] +classifiers = ["Programming Language :: Python :: 3.11"] [project.license] file = "LICENSE" @@ -31,7 +31,7 @@ content-type = "text/markdown" [tool] rye = { dev-dependencies = [ 
"diff-cover==8.0.3", - "pyright==1.1.350", + "pyright>=1.1.368", "pytest>=7.4.0", "pytest-cov==4.1.0", "pytest-xdist==3.5.0", diff --git a/pyrightconfig.json b/pyrightconfig.json index 77723b3..1e66e30 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -7,7 +7,8 @@ "reportMissingTypeStubs": false, "reportMissingParameterType": false, // Covered by ruff ANN "reportUnknownParameterType": false, // Covered by ruff ANN - "reportPrivateUsage": false, // Covered by Ruff PLC + "reportUnusedExpression": false, // Covered by ruff B018 + "reportPrivateUsage": false, // Covered by ruff PLC2701 "reportUntypedFunctionDecorator": false, "reportUnusedImport": "none", // Covered by ruff "typeCheckingMode": "strict" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..83a8caf --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +python_files = *.py +python_functions = test_* _should_* \ No newline at end of file diff --git a/requirements-dev.lock b/requirements-dev.lock index d5aad81..badd755 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -18,6 +18,7 @@ anthropic==0.21.3 # via memorymarker anyio==3.7.1 # via anthropic + # via gql # via httpx # via openai attrs==23.2.0 @@ -56,7 +57,7 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.3.1 # via huggingface-hub -gql==3.4.1 +gql==3.5.0 # via omnivoreql graphql-core==3.2.3 # via gql @@ -99,7 +100,7 @@ multidict==6.0.5 # via yarl nodeenv==1.8.0 # via pyright -omnivoreql==0.2.1 +omnivoreql==0.3.3 # via memorymarker openai==1.13.3 # via instructor @@ -126,7 +127,7 @@ pygments==2.17.2 # via diff-cover # via rich pyinstrument==4.6.2 -pyright==1.1.350 +pyright==1.1.368 pytest==7.4.4 # via pytest-asyncio # via pytest-cov @@ -143,6 +144,7 @@ pytest-testmon==2.1.0 pytest-xdist==3.5.0 python-dotenv==1.0.1 # via memorymarker + # via omnivoreql # via pytest-dotenv pytz==2024.1 # via memorymarker diff --git a/requirements.lock b/requirements.lock index 3f4a979..7d1df40 100644 --- a/requirements.lock 
+++ b/requirements.lock @@ -18,6 +18,7 @@ anthropic==0.21.3 # via memorymarker anyio==3.7.1 # via anthropic + # via gql # via httpx # via openai attrs==23.2.0 @@ -48,7 +49,7 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.3.1 # via huggingface-hub -gql==3.4.1 +gql==3.5.0 # via omnivoreql graphql-core==3.2.3 # via gql @@ -81,7 +82,7 @@ mdurl==0.1.2 multidict==6.0.5 # via aiohttp # via yarl -omnivoreql==0.2.1 +omnivoreql==0.3.3 # via memorymarker openai==1.13.3 # via instructor @@ -101,6 +102,7 @@ pygments==2.17.2 # via rich python-dotenv==1.0.1 # via memorymarker + # via omnivoreql pytz==2024.1 # via memorymarker pyyaml==6.0.1 diff --git a/src/memorymarker/__init__.py b/src/memorymarker/__init__.py new file mode 100644 index 0000000..40b6cf6 --- /dev/null +++ b/src/memorymarker/__init__.py @@ -0,0 +1,38 @@ +# ############################ +# ## NOTES ON IMPORT FORMAT ## +# ############################ +# +# From https://github.com/dagster-io/dagster/blob/master/python_modules/dagster/dagster/__init__.py +# +# This file defines your package's public API. Imports need to be structured/formatted so as to ensure +# that the broadest possible set of static analyzers understand your_package's public API as intended. +# The below guidelines ensure this is the case. +# +# (1) All imports in this module intended to define exported symbols should be of the form `from +# your_package.foo import X as X`. This is because imported symbols are not by default considered public +# by static analyzers. The redundant alias form `import X as X` overwrites the private imported `X` +# with a public `X` bound to the same value. It is also possible to expose `X` as public by listing +# it inside `__all__`, but the redundant alias form is preferred here due to easier maintainability. + +# (2) All imports should target the module in which a symbol is actually defined, rather than a +# container module where it is imported.
This rule also derives from the default private status of +# imported symbols. So long as there is a private import somewhere in the import chain leading from +# an import to its definition, some linters will be triggered (e.g. pyright). For example, the +# following results in a linter error when using your_package as a third-party library: + +# ### your_package/foo/bar.py +# BAR = "BAR" +# +# ### your_package/foo/__init__.py +# from .bar import BAR # BAR is imported so it is not part of your_package.foo public interface +# FOO = "FOO" +# +# ### your_package/__init__.py +# from .foo import FOO, BAR # importing BAR is importing a private symbol from your_package.foo +# __all__ = ["FOO", "BAR"] +# +# ### some_user_code.py +# # from your_package import BAR # linter error even though `BAR` is in `your_package.__all__`! +# +# We could get around this by always remembering to use the `from .foo import X as X` form in +# containers, but it is simpler to just import directly from the defining module.