
Commit

Finish refactor for pip module; local Python server works
enjalot committed Jan 29, 2024
1 parent b3d83db commit e842ebc
Showing 21 changed files with 212 additions and 148 deletions.
9 changes: 5 additions & 4 deletions latentscope/models/__init__.py
@@ -1,5 +1,6 @@
import os
import json
import pkg_resources
from .providers.transformers import TransformersEmbedProvider, TransformersChatProvider
from .providers.openai import OpenAIEmbedProvider, OpenAIChatProvider
from .providers.mistralai import MistralAIEmbedProvider, MistralAIChatProvider
@@ -10,8 +11,8 @@

def get_embedding_model(id):
"""Returns a ModelProvider instance for the given model id."""
embed_models_path = os.path.join(os.path.dirname(__file__), "embedding_models.json")
with open(embed_models_path, "r") as f:
embedding_path = pkg_resources.resource_filename('latentscope.models', 'embedding_models.json')
with open(embedding_path, "r") as f:
embed_model_list = json.load(f)
embed_model_dict = {model['id']: model for model in embed_model_list}
model = embed_model_dict[id]
@@ -34,8 +35,8 @@ def get_embedding_model(id):

def get_chat_model(id):
"""Returns a ModelProvider instance for the given model id."""
chat_models_path = os.path.join(os.path.dirname(__file__), "chat_models.json")
with open(chat_models_path, "r") as f:
chat_path = pkg_resources.resource_filename('latentscope.models', 'chat_models.json')
with open(chat_path, "r") as f:
chat_model_list = json.load(f)
chat_model_dict = {model['id']: model for model in chat_model_list}
model = chat_model_dict[id]
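The change above swaps a __file__-relative path for pkg_resources.resource_filename so that embedding_models.json and chat_models.json resolve correctly once latentscope is installed from pip rather than run from a source checkout. A minimal sketch of the same lookup with the standard library's importlib.resources is shown below as an alternative, not what this commit uses; load_model_list is a hypothetical helper name.

# Sketch only: importlib.resources (Python 3.9+) resolves packaged data files
# without depending on setuptools' pkg_resources. Not part of this commit.
import json
from importlib.resources import files

def load_model_list(filename):
    # files() returns a Traversable rooted at the installed package
    resource = files("latentscope.models").joinpath(filename)
    with resource.open("r") as f:
        return json.load(f)

embed_model_dict = {m["id"]: m for m in load_model_list("embedding_models.json")}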
5 changes: 2 additions & 3 deletions latentscope/models/providers/cohereai.py
@@ -3,12 +3,11 @@
import cohere
from .base import EmbedModelProvider

from dotenv import load_dotenv
load_dotenv()
from latentscope.util import get_key

class CohereAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = cohere.Client(os.getenv("COHERE_API_KEY"))
self.client = cohere.Client(get_key("COHERE_API_KEY"))

def embed(self, inputs):
time.sleep(0.01) # TODO proper rate limiting
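Here and in the providers below, the per-module load_dotenv() calls are replaced by a single get_key helper from latentscope.util. That helper's implementation is not part of this diff; the sketch below is one plausible shape for it, assuming it still reads keys from a .env file via python-dotenv.

# Hypothetical sketch of a centralized key lookup; the real latentscope.util
# implementation is not shown in this commit.
import os
from dotenv import load_dotenv

_ENV_LOADED = False

def get_key(name):
    """Load the .env file once, then return the named environment variable."""
    global _ENV_LOADED
    if not _ENV_LOADED:
        load_dotenv()
        _ENV_LOADED = True
    value = os.getenv(name)
    if value is None:
        print(f"WARNING: {name} is not set; API calls will fail")
    return value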
7 changes: 3 additions & 4 deletions latentscope/models/providers/mistralai.py
@@ -5,8 +5,7 @@
from transformers import AutoTokenizer
from .base import EmbedModelProvider,ChatModelProvider

from dotenv import load_dotenv
load_dotenv()
from latentscope.util import get_key

# TODO verify these tokenizers somehow
# derived from:
@@ -20,7 +19,7 @@

class MistralAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = MistralClient(os.getenv("MISTRAL_API_KEY"))
self.client = MistralClient(get_key("MISTRAL_API_KEY"))

def embed(self, inputs):
time.sleep(0.1) # TODO proper rate limiting
@@ -29,7 +28,7 @@ def embed(self, inputs):

class MistralAIChatProvider(ChatModelProvider):
def load_model(self):
self.client = MistralClient(api_key=os.getenv("MISTRAL_API_KEY"))
self.client = MistralClient(api_key=get_key("MISTRAL_API_KEY"))
self.encoder = AutoTokenizer.from_pretrained(encoders[self.name])

def chat(self, messages):
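MistralAIChatProvider loads a Hugging Face tokenizer looked up from the encoders mapping earlier in the file, presumably to count tokens client-side, since tiktoken only covers OpenAI models. A small illustrative use follows; the checkpoint name is a placeholder, not taken from this diff.

# Illustrative only: count tokens for a prompt with a Hugging Face tokenizer.
# The checkpoint below is an example, not the commit's encoders mapping.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

def count_tokens(text):
    # encode() returns token ids; their count approximates the prompt cost
    return len(tokenizer.encode(text))

print(count_tokens("Label this cluster of support tickets in a few words."))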
7 changes: 3 additions & 4 deletions latentscope/models/providers/openai.py
@@ -4,12 +4,11 @@
from openai import OpenAI
from .base import EmbedModelProvider, ChatModelProvider

from dotenv import load_dotenv
load_dotenv()
from latentscope.util import get_key

class OpenAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
self.client = OpenAI(api_key=get_key("OPENAI_API_KEY"))
self.encoder = tiktoken.encoding_for_model(self.name)

def embed(self, inputs):
@@ -27,7 +26,7 @@ def embed(self, inputs):

class OpenAIChatProvider(ChatModelProvider):
def load_model(self):
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
self.client = OpenAI(api_key=get_key("OPENAI_API_KEY"))
self.encoder = tiktoken.encoding_for_model(self.name)

def chat(self, messages):
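Both OpenAI providers build a tiktoken encoder for the configured model name. A quick usage sketch, with the model name chosen as an example rather than read from this diff:

# Sketch: tiktoken gives client-side token counts for OpenAI models, useful
# for batching inputs under the embedding context limit.
import tiktoken

enc = tiktoken.encoding_for_model("text-embedding-ada-002")
tokens = enc.encode("ladies and gentlemen, the latent scope")
print(len(tokens))         # number of tokens the API will see
print(enc.decode(tokens))  # round-trips back to the original string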
5 changes: 2 additions & 3 deletions latentscope/models/providers/togetherai.py
@@ -4,12 +4,11 @@
import together
from .base import EmbedModelProvider

from dotenv import load_dotenv
load_dotenv()
from latentscope.util import get_key

class TogetherAIEmbedProvider(EmbedModelProvider):
def load_model(self):
together.api_key = os.getenv("TOGETHER_API_KEY")
together.api_key = get_key("TOGETHER_API_KEY")
self.client = together.Together()
self.encoder = tiktoken.encoding_for_model("text-embedding-ada-002")

5 changes: 2 additions & 3 deletions latentscope/models/providers/voyageai.py
@@ -3,12 +3,11 @@
import voyageai
from .base import EmbedModelProvider

from dotenv import load_dotenv
load_dotenv()
from latentscope.util import get_key

class VoyageAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = voyageai.Client(os.getenv("VOYAGE_API_KEY"))
self.client = voyageai.Client(get_key("VOYAGE_API_KEY"))

def embed(self, inputs):
time.sleep(0.1) # TODO proper rate limiting
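All of these providers subclass EmbedModelProvider (or ChatModelProvider) from providers/base.py, which this commit does not touch. Inferring only from the methods used above, a hypothetical minimal base might look like the sketch below; the constructor signature is a guess.

# Hypothetical: the embed-provider interface implied by the code above; the
# real providers/base.py is not shown in this commit. ChatModelProvider would
# be analogous, with chat(messages) instead of embed(inputs).
class EmbedModelProvider:
    def __init__(self, name):
        self.name = name  # model identifier, e.g. an API model name

    def load_model(self):
        """Set up clients, API keys, and tokenizers."""
        raise NotImplementedError

    def embed(self, inputs):
        """Return embedding vectors for a batch of input strings."""
        raise NotImplementedError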
56 changes: 40 additions & 16 deletions latentscope/scripts/cluster.py
@@ -2,23 +2,49 @@
# Example: python cluster.py dadabase-curated umap-001 50 5
import os
import re
import sys
import json
import hdbscan
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from scipy.spatial.distance import cdist

from latentscope.util import get_data_dir

# TODO move this into shared space
def calculate_point_size(num_points, min_size=10, max_size=30, base_num_points=100):
"""
Calculate the size of points for a scatter plot based on the number of points.
"""
# TODO fix this to actually calculate a log scale between min and max size
if num_points <= base_num_points:
return max_size
else:
return min(min_size + min_size * np.log(num_points / base_num_points), max_size)


def main():
parser = argparse.ArgumentParser(description='Cluster UMAP embeddings')
parser.add_argument('dataset_name', type=str, help='Name of the dataset')
parser.add_argument('umap_name', type=str, help='Name of the UMAP file')
parser.add_argument('samples', type=int, help='Minimum cluster size')
parser.add_argument('min_samples', type=int, help='Minimum samples for HDBSCAN')

args = parser.parse_args()
clusterer(args.dataset_name, args.umap_name, args.samples, args.min_samples)


def clusterer(dataset_name, umap_name, samples, min_samples):
DATA_DIR = get_data_dir()
cluster_dir = os.path.join(DATA_DIR, dataset_name, "clusters")
# Check if clusters directory exists, if not, create it
if not os.path.exists(f'../data/{dataset_name}/clusters'):
os.makedirs(f'../data/{dataset_name}/clusters')
if not os.path.exists(cluster_dir):
os.makedirs(cluster_dir)
# determine the index of the last cluster run by looking in the dataset directory
# for files named umap-<number>.json
cluster_files = [f for f in os.listdir(f"../data/{dataset_name}/clusters") if re.match(r"cluster-\d+\.json", f)]
cluster_files = [f for f in os.listdir(cluster_dir) if re.match(r"cluster-\d+\.json", f)]
print("cluster files", sorted(cluster_files))
if len(cluster_files) > 0:
last_cluster = sorted(cluster_files)[-1]
@@ -31,7 +57,7 @@ def clusterer(dataset_name, umap_name, samples, min_samples):
# make the umap name from the number, zero padded to 3 digits
cluster_name = f"cluster-{next_cluster_number:03d}"

umap_embeddings_df = pd.read_parquet(f"../data/{dataset_name}/umaps/{umap_name}.parquet")
umap_embeddings_df = pd.read_parquet(os.path.join(DATA_DIR, dataset_name, "umaps", f"{umap_name}.parquet"))
umap_embeddings = umap_embeddings_df.to_numpy()

clusterer = hdbscan.HDBSCAN(min_cluster_size=samples, min_samples=min_samples, metric='euclidean')
@@ -63,14 +89,16 @@

# save umap embeddings to a parquet file with columns x,y
df = pd.DataFrame({"cluster": cluster_labels, "raw_cluster": raw_cluster_labels})
output_file = f"../data/{dataset_name}/clusters/{cluster_name}.parquet"
output_file = os.path.join(cluster_dir, f"{cluster_name}.parquet")
df.to_parquet(output_file)
print(df.head())
print("wrote", output_file)

# generate a scatterplot of the umap embeddings and save it to a file
fig, ax = plt.subplots(figsize=(6, 6))
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], s=1, alpha=0.5, c=cluster_labels, cmap='Spectral')
fig, ax = plt.subplots(figsize=(14.22, 14.22)) # 1024px by 1024px at 72 dpi
point_size = calculate_point_size(umap_embeddings.shape[0])
print("POINT SIZE", point_size, "for", umap_embeddings.shape[0], "points")
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], s=point_size, alpha=0.5, c=cluster_labels, cmap='Spectral')
# plot a convex hull around each cluster
for label in non_noise_labels:
points = umap_embeddings[cluster_labels == label]
@@ -80,9 +108,9 @@ def clusterer(dataset_name, umap_name, samples, min_samples):

plt.axis('off') # remove axis
plt.gca().set_position([0, 0, 1, 1]) # remove margins
plt.savefig(f"../data/{dataset_name}/clusters/{cluster_name}.png")
plt.savefig(os.path.join(cluster_dir, f"{cluster_name}.png"))

with open(f'../data/{dataset_name}/clusters/{cluster_name}.json', 'w') as f:
with open(os.path.join(cluster_dir,f"{cluster_name}.json"), 'w') as f:
json.dump({
"cluster_name": cluster_name,
"umap_name": umap_name,
@@ -106,12 +134,8 @@ def clusterer(dataset_name, umap_name, samples, min_samples):
slides_df = pd.concat([slides_df, new_row], ignore_index=True)

# write the df to parquet
slides_df.to_parquet(f"../data/{dataset_name}/clusters/{cluster_name}-labels.parquet")
slides_df.to_parquet(os.path.join(cluster_dir, f"{cluster_name}-labels.parquet"))
print("done")

if __name__ == "__main__":
dataset_name = sys.argv[1]
umap_name = sys.argv[2]
samples = int(sys.argv[3])
min_samples = int(sys.argv[4])
clusterer(dataset_name, umap_name, samples, min_samples)
main()
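The new calculate_point_size helper still carries a TODO about using a real log scale between min_size and max_size. One way to do that, offered as a suggestion rather than part of this commit, with max_num_points as an assumed upper bound:

# Suggested (not in the commit): interpolate point size on a log scale from
# max_size at base_num_points down to min_size at an assumed max_num_points.
import numpy as np

def calculate_point_size_log(num_points, min_size=10, max_size=30,
                             base_num_points=100, max_num_points=1_000_000):
    if num_points <= base_num_points:
        return max_size
    # position of num_points, in log space, between the base and max counts
    t = np.log(num_points / base_num_points) / np.log(max_num_points / base_num_points)
    t = min(max(t, 0.0), 1.0)
    return max_size - t * (max_size - min_size)

print(calculate_point_size_log(100))      # 30.0
print(calculate_point_size_log(100_000))  # 15.0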
40 changes: 21 additions & 19 deletions latentscope/scripts/embed.py
@@ -1,22 +1,31 @@
# Usage: python embed-local.py <dataset_name> <text_column> <model>
# Usage: ls-embed <dataset_name> <text_column> <model>
import os
import sys
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm

# TODO is this hacky way to import from the models directory?
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from models import get_embedding_model
from latentscope.models import get_embedding_model
from latentscope.util import get_data_dir

def chunked_iterable(iterable, size):
"""Yield successive chunks from an iterable."""
for i in range(0, len(iterable), size):
yield iterable[i:i + size]

def embedder(dataset_name, text_column="text", model_id="transformers-BAAI___bge-small-en-v1.5"):
df = pd.read_parquet(f"../data/{dataset_name}/input.parquet")
def main():
parser = argparse.ArgumentParser(description='Embed a dataset')
parser.add_argument('name', type=str, help='Dataset name (directory name in data/)')
parser.add_argument('text_column', type=str, help='Output file', default='text')
parser.add_argument('model', type=str, help='ID of embedding model to use', default="transformers-BAAI___bge-small-en-v1.5")

# Parse arguments
args = parser.parse_args()
embed(args.name, args.text_column, args.model)

def embed(dataset_name, text_column, model_id):
DATA_DIR = get_data_dir()
df = pd.read_parquet(os.path.join(DATA_DIR, dataset_name, "input.parquet"))
sentences = df[text_column].tolist()

model = get_embedding_model(model_id)
@@ -36,19 +45,12 @@ def embedder(dataset_name, text_column="text", model_id="transformers-BAAI___bge
print("sentence embeddings:", np_embeds.shape)

# Save embeddings as a numpy file
if not os.path.exists(f'../data/{dataset_name}/embeddings'):
os.makedirs(f'../data/{dataset_name}/embeddings')
emb_dir = os.path.join(DATA_DIR, dataset_name, "embeddings")
if not os.path.exists(emb_dir):
os.makedirs(emb_dir)

np.save(f'../data/{dataset_name}/embeddings/{model_id}.npy', np_embeds)
np.save(os.path.join(DATA_DIR, dataset_name, "embeddings", f"{model_id}.npy"), np_embeds)
print("done")

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Embed a dataset')
parser.add_argument('name', type=str, help='Dataset name (directory name in data/)')
parser.add_argument('text_column', type=str, help='Output file', default='text')
parser.add_argument('model', type=str, help='ID of embedding model to use', default="transformers-BAAI___bge-small-en-v1.5")

# Parse arguments
args = parser.parse_args()

embedder(args.name, args.text_column, args.model)
main()
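The chunked_iterable helper batches the sentences before each call to model.embed. A quick usage sketch, with an arbitrary example batch size:

# Usage sketch for chunked_iterable (copied from the script above); the batch
# size of 100 is an arbitrary example.
def chunked_iterable(iterable, size):
    """Yield successive chunks from an iterable."""
    for i in range(0, len(iterable), size):
        yield iterable[i:i + size]

sentences = [f"sentence {i}" for i in range(250)]
for batch in chunked_iterable(sentences, 100):
    print(len(batch))  # 100, 100, 50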
13 changes: 10 additions & 3 deletions latentscope/scripts/ingest.py
@@ -1,15 +1,22 @@
# Usage: python ingest.py <dataset_name>
# Usage: ls-ingest <dataset_name>
import os
import sys
import json
import argparse
import pandas as pd

from latentscope.util import get_data_dir

# TODO: somehow optionally accept a pandas dataframe as input
def main():
parser = argparse.ArgumentParser(description='Ingest a dataset')
parser.add_argument('name', type=str, help='Dataset name (directory name in data folder)')
args = parser.parse_args()
ingest(args.name)

def ingest(dataset_name):
DATA_DIR = get_data_dir()
dataset_name = sys.argv[1]
directory = os.path.join(DATA_DIR, dataset_name)
# TODO: inspect the incoming data to see if it is a csv or parquet file
csv_file = os.path.join(directory, "input.csv")
print("reading", csv_file)
df = pd.read_csv(csv_file)
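ingest.py still reads input.csv unconditionally and carries a TODO about detecting the input format. A possible shape for that check, offered as a suggestion rather than the commit's behavior:

# Suggested sketch (not in the commit): prefer an existing parquet input,
# fall back to csv, and fail loudly otherwise.
import os
import pandas as pd

def read_input(directory):
    parquet_path = os.path.join(directory, "input.parquet")
    csv_path = os.path.join(directory, "input.csv")
    if os.path.exists(parquet_path):
        return pd.read_parquet(parquet_path)
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)
    raise FileNotFoundError(f"no input.csv or input.parquet found in {directory}")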
(next changed file: the cluster labeling script; filename not shown in this view)
@@ -6,13 +6,9 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()

# TODO is this hacky way to import from the models directory?
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from models import get_chat_model
from latentscope.util import get_data_dir
from latentscope.models import get_chat_model

def chunked_iterable(iterable, size):
"""Yield successive chunks from an iterable."""
@@ -26,13 +22,29 @@ def too_many_duplicates(line, threshold=10):
word_count[word] = word_count.get(word, 0) + 1
return any(count > threshold for count in word_count.values())

def main():
parser = argparse.ArgumentParser(description='Label a set of slides using OpenAI')
parser.add_argument('name', type=str, help='Dataset name (directory name in data/)')
parser.add_argument('text_column', type=str, help='Output file', default='text')
parser.add_argument('cluster_name', type=str, help='name of slides set', default='cluster-001')
parser.add_argument('model', type=str, help='Name of model to use', default="openai-gpt-3.5-turbo")
parser.add_argument('context', type=str, help='Additional context for labeling model', default="")

# Parse arguments
args = parser.parse_args()

labeler(args.name, args.text_column, args.cluster_name, args.model, args.context)


def labeler(dataset_name, text_column="text", cluster_name="cluster-001", model_id="gpt-3.5-turbo", context=""):
df = pd.read_parquet(f"../data/{dataset_name}/input.parquet")
# TODO This should be dropped in the preprocessing step
DATA_DIR = get_data_dir()
df = pd.read_parquet(os.path.join(DATA_DIR, dataset_name, "input.parquet"))
# TODO This should be done in the preprocessing step
df = df.reset_index(drop=True)

# Load the indices for each cluster from the prepopulated labels file generated by cluster.py
clusters = pd.read_parquet(f"../data/{dataset_name}/clusters/{cluster_name}-labels.parquet")
cluster_dir = os.path.join(DATA_DIR, dataset_name, "clusters")
clusters = pd.read_parquet(os.path.join(cluster_dir, f"{cluster_name}-labels.parquet"))

model = get_chat_model(model_id)
model.load_model()
@@ -104,18 +116,8 @@ def labeler(dataset_name, text_column="text", cluster_name="cluster-001", model_
clusters_df['label_raw'] = labels

# write the df to parquet
clusters_df.to_parquet(f"../data/{dataset_name}/clusters/{cluster_name}-labels-{model_id}.parquet")
clusters_df.to_parquet(os.path.join(cluster_dir, f"{cluster_name}-labels-{model_id}.parquet"))
print("done")

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Label a set of slides using OpenAI')
parser.add_argument('name', type=str, help='Dataset name (directory name in data/)')
parser.add_argument('text_column', type=str, help='Output file', default='text')
parser.add_argument('cluster_name', type=str, help='name of slides set', default='cluster-001')
parser.add_argument('model', type=str, help='Name of model to use', default="openai-gpt-3.5-turbo")
parser.add_argument('context', type=str, help='Additional context for labeling model', default="")

# Parse arguments
args = parser.parse_args()

labeler(args.name, args.text_column, args.cluster_name, args.model, args.context)
main()
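The labeling script filters cluster samples through too_many_duplicates before sending them to the chat model; only the tail of that function is visible in this diff, so the sketch below reconstructs a plausible full version and shows how it behaves.

# Reconstruction for illustration: the top of too_many_duplicates falls
# outside the visible hunk, so the word-splitting step here is assumed.
def too_many_duplicates(line, threshold=10):
    word_count = {}
    for word in str(line).split():
        word_count[word] = word_count.get(word, 0) + 1
    return any(count > threshold for count in word_count.values())

print(too_many_duplicates("spam " * 50))              # True: one word repeats 50x
print(too_many_duplicates("a perfectly normal row"))  # False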
(remaining changed files not loaded in this view)
