Skip to content

Commit

Permalink
db_path -> db_url, note on postgres users
Browse files Browse the repository at this point in the history
  • Loading branch information
parkervg committed May 14, 2024
1 parent d81a7e5 commit 2f5530c
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 24 deletions.
30 changes: 25 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,27 @@ A PostgreSQL database connection.
Can be initialized via the SQLAlchemy input string.
https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql

#### Creating a `blendsql` User

Before a BlendSQL query is executed, internal checks ensure that it does not contain any 'modify' actions.

However, it is still best practice when using PostgreSQL to create a dedicated 'blendsql' user with only the permissions needed.

You can create a user with the required permissions with the script below (after invoking postgres via `psql`):

```sql
CREATE USER blendsql;
GRANT pg_read_all_data TO blendsql;
GRANT TEMP ON DATABASE mydb TO blendsql;
```

Now, we can initialize a PostgreSQL database with our new user.

```python
from blendsql.db import PostgreSQL
db = PostgreSQL("blendsql@localhost:5432/mydb")
```

Example:
```python
from blendsql.db import PostgreSQL
Expand Down Expand Up @@ -459,15 +480,14 @@ class Smoothie:

@dataclass
class SmoothieMeta:
process_time_seconds: float
num_values_passed: int # Number of values passed to a Map/Join/QA ingredient
num_prompt_tokens: int # Number of prompt tokens (counting user and assistant, i.e. input/output)
prompts: List[str] # Log of prompts submitted to model
example_map_outputs: List[Any] # outputs from a Map ingredient, for debugging
ingredients: List[Ingredient]
prompts: List[str] # Log of prompts submitted to model
ingredients: Collection[Ingredient]
query: str
db_path: str
db_url: str
contains_ingredient: bool = True
process_time_seconds: float = None

def blend(*args, **kwargs) -> Smoothie:
...
Expand Down
2 changes: 1 addition & 1 deletion blendsql/_smoothie.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class SmoothieMeta:
prompts: List[str] # Log of prompts submitted to model
ingredients: Collection[Ingredient]
query: str
db_path: str
db_url: str
contains_ingredient: bool = True
process_time_seconds: float = None

Expand Down
6 changes: 3 additions & 3 deletions blendsql/blend.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def _blend(
prompts=blender.prompts if blender is not None else [],
ingredients=[],
query=original_query,
db_path=db.db_path,
db_url=str(db.db_url),
contains_ingredient=False,
),
)
Expand All @@ -459,7 +459,7 @@ def _blend(
session_modified_tables = set()
# TODO: Currently, as we traverse upwards from deepest subquery,
# if any lower subqueries have an ingredient, we deem the current
# as inelligible for optimization. Maybe this can be improved in the future.
# as ineligible for optimization. Maybe this can be improved in the future.
prev_subquery_has_ingredient = False
for subquery_idx, subquery in enumerate(get_reversed_subqueries(_query)):
# At this point, we should have already handled cte statements and created associated tables
Expand Down Expand Up @@ -822,7 +822,7 @@ def _blend(
prompts=blender.prompts if blender is not None else [],
ingredients=ingredients,
query=original_query,
db_path=db.db_path,
db_url=str(db.db_url),
),
)

Expand Down
6 changes: 3 additions & 3 deletions blendsql/blend_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ def main():

_ = readline
parser = argparse.ArgumentParser()
parser.add_argument("db_path", nargs="?")
parser.add_argument("db_url", nargs="?")
parser.add_argument("secrets_path", nargs="?", default="./secrets.json")
args = parser.parse_args()

db = SQLite(db_path=args.db_path)
print_msg_box(f"Beginning BlendSQL session with '{args.db_path}'...")
db = SQLite(db_url=args.db_url)
print_msg_box(f"Beginning BlendSQL session with '{args.db_url}'...")
print()
while True:
lines = []
Expand Down
7 changes: 3 additions & 4 deletions blendsql/db/_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sqlalchemy.schema import CreateTable
from sqlalchemy import create_engine, inspect, MetaData
from sqlalchemy.sql import text
from sqlalchemy.engine import Engine, Connection
from sqlalchemy.engine import Engine, Connection, URL
from pandas.io.sql import get_schema
from abc import abstractmethod

Expand All @@ -17,16 +17,15 @@

@attrs(auto_detect=True)
class Database:
db_path: str = attrib()
db_prefix: str = attrib()
db_url: URL = attrib()

engine: Engine = attrib(init=False)
con: Connection = attrib(init=False)
all_tables: List[str] = attrib(init=False)
tablename_to_columns: Dict[str, Iterable] = attrib(init=False)

def __attrs_post_init__(self):
self.engine = create_engine(f"{self.db_prefix}{self.db_path}")
self.engine = create_engine(self.db_url)
self.con = self.engine.connect()
self.metadata = MetaData()
self.metadata.reflect(bind=self.engine)
Expand Down
11 changes: 10 additions & 1 deletion blendsql/db/_postgres.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import importlib.util
from sqlalchemy.engine import make_url, URL
from colorama import Fore
import logging

from ._database import Database

Expand All @@ -22,7 +25,13 @@ def __init__(self, db_path: str):
raise ImportError(
"Please install psycopg2 with `pip install psycopg2-binary`!"
) from None
super().__init__(db_path=db_path, db_prefix="postgresql+psycopg2://")
db_url: URL = make_url(f"postgresql+psycopg2://{db_path}")
if db_url.username is None:
logging.warning(
Fore.RED
+ "Connecting to postgreSQL database without specifying user!\nIt is strongly encouraged to create a `blendsql` user with read-only permissions and temp table creation privileges."
)
super().__init__(db_url=db_url)

def has_temp_table(self, tablename: str) -> bool:
return (
Expand Down
6 changes: 4 additions & 2 deletions blendsql/db/_sqlite.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
from sqlalchemy.engine import make_url, URL

from .utils import double_quote_escape
from ._database import Database
Expand All @@ -15,8 +16,9 @@ class SQLite(Database):
```
"""

def __init__(self, db_path: str):
super().__init__(db_path=Path(db_path).resolve(), db_prefix="sqlite:///")
def __init__(self, db_url: str):
db_url: URL = make_url(f"sqlite:///{Path(db_url).resolve()}")
super().__init__(db_url=db_url)

def has_temp_table(self, tablename: str) -> bool:
return (
Expand Down
23 changes: 22 additions & 1 deletion docs/reference/databases/postgresql.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,25 @@

::: blendsql.db._postgres.PostgreSQL
handler: python
show_source: true
show_source: true

## Creating a `blendsql` User

Before a BlendSQL query is executed, internal checks ensure that it does not contain any 'modify' actions.

However, it is still best practice when using PostgreSQL to create a dedicated 'blendsql' user with only the permissions needed.

You can create a user with the required permissions with the script below (after invoking postgres via `psql`):

```sql
CREATE USER blendsql;
GRANT pg_read_all_data TO blendsql;
GRANT TEMP ON DATABASE mydb TO blendsql;
```

Now, we can initialize a PostgreSQL database with our new user.

```python
from blendsql.db import PostgreSQL
db = PostgreSQL("blendsql@localhost:5432/mydb")
```
13 changes: 10 additions & 3 deletions research/run-debug.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from blendsql import blend, LLMJoin
from blendsql.db import SQLite
from blendsql.db import SQLite, PostgreSQL
from blendsql.models import OpenaiLLM
from blendsql.utils import fetch_from_hub, tabulate

Expand All @@ -23,6 +23,13 @@
ingredients={LLMJoin},
)
print(tabulate(smoothie.df))
import json

print(json.dumps(smoothie.meta.prompts, indent=4))
smoothie = blend(
query=blendsql,
db=PostgreSQL(
"blendsql@localhost:5432/1884_New_Zealand_rugby_union_tour_of_New_South_Wales_1"
),
blender=OpenaiLLM("gpt-3.5-turbo"),
ingredients={LLMJoin},
)
print(tabulate(smoothie.df))
2 changes: 1 addition & 1 deletion research/run-evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def main() -> None:
if data_training_args.overwrite_cache:
# Remove the appropriate directory containing our save db files
if data_args.dataset == "wikitq":
dataset_db_path = Path(data_training_args.db_path) / "wikitq"
dataset_db_path = Path(data_training_args.db_url) / "wikitq"
if dataset_db_path.is_dir():
shutil.rmtree(str(dataset_db_path))

Expand Down

0 comments on commit 2f5530c

Please sign in to comment.