Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

experimental multilingual idea #171

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
experimental multilingual idea
  • Loading branch information
richard-rogers committed Oct 26, 2023
commit 53f66172865193869691d3a1f0c4dc89e7ae2d56
23 changes: 12 additions & 11 deletions langkit/all_metrics.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering that the modules are imported before calling init with the desired languages, does that mean that English will always be applied, and the other languages will add additional language-specific metrics on top?

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import List, Optional
from whylogs.experimental.core.udf_schema import udf_schema
from whylogs.core.schema import DeclarativeSchema

Expand All @@ -13,14 +13,15 @@
from langkit import input_output


def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
injections.init(config=config)
topics.init(config=config)
regexes.init(config=config)
sentiment.init(config=config)
textstat.init(config=config)
themes.init(config=config)
toxicity.init(config=config)
input_output.init(config=config)
text_schema = udf_schema()
def init(languages: List[str] = ["en"], config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
    """Initialize every LangKit metric module for each requested language.

    Args:
        languages: Language codes to register metrics for (defaults to English).
        config: Optional LangKitConfig passed through to each module's init.

    Returns:
        A DeclarativeSchema chaining the per-language UDF schemas.
    """
    # fix: loop variable source was misspelled "langauges", which would raise
    # NameError the first time init() ran.
    for language in languages:
        injections.init(language, config=config)
        topics.init(language, config=config)
        regexes.init(language, config=config)
        sentiment.init(language, config=config)
        textstat.init(language, config=config)
        themes.init(language, config=config)
        toxicity.init(language, config=config)
        input_output.init(language, config=config)
    text_schema = udf_schema(chained_schemas=languages)
    return text_schema
11 changes: 7 additions & 4 deletions langkit/count_regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,26 @@ def _unregister():
_registered = set()


def _register_udfs():
def _register_udfs(language: str):
    """Re-register one pattern-count UDF per regex group, per column.

    Clears any prior registrations, then registers a
    ``<language>.<column>.<group>_count`` UDF for both the prompt and
    response columns under the language-scoped schema, tracking every
    registered name in the module-level ``_registered`` set.
    """
    global _registered
    _unregister()
    groups = pattern_loader.get_regex_groups()
    if groups is None:
        return
    for column in (prompt_column, response_column):
        for group in groups:
            name = f"{language}.{column}.{group['name']}_count"
            register_dataset_udf(
                [column],
                udf_name=name,
                schema_name=language,
            )(wrapper(group, column))
            _registered.add(name)


def init(
pattern_file_path: Optional[str] = None, config: Optional[LangKitConfig] = None
language: str = "en",
pattern_file_path: Optional[str] = None,
config: Optional[LangKitConfig] = None
):
config = deepcopy(config or lang_config)
if pattern_file_path:
Expand All @@ -70,7 +73,7 @@ def init(
pattern_loader = PatternLoader(config)
pattern_loader.update_patterns()

_register_udfs()
_register_udfs(language)


init()
32 changes: 19 additions & 13 deletions langkit/injections.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,21 @@ def download_embeddings(url):
return array


def injection(prompt: Union[Dict[str, List], pd.DataFrame]) -> Union[List, pd.Series]:
    """Return, for each prompt row, the distance to its nearest neighbor
    in the known prompt-injection embeddings index.

    Raises:
        ValueError: if the transformer model or the embeddings index has
            not been initialized via ``init()``.
    """
    global _transformer_model
    global _index_embeddings
    if _transformer_model is None:
        raise ValueError("Injections - transformer model not initialized")
    vectors = _transformer_model.encode(prompt[_prompt])
    # normalize_L2 mutates the embedding matrix in place.
    faiss.normalize_L2(vectors)
    if _index_embeddings is None:
        raise ValueError("Injections - index embeddings not initialized")
    distances, _ = _index_embeddings.search(x=vectors, k=1)
    return distances.flatten().tolist()


def init(
language: str = "en",
transformer_name: Optional[str] = None,
version: Optional[str] = None,
config: Optional[LangKitConfig] = None,
Expand Down Expand Up @@ -73,19 +87,11 @@ def init(
f"Injections - unable to deserialize index to {embeddings_path}. Error: {deserialization_error}"
)


@register_dataset_udf([_prompt], f"{_prompt}.injection")
def injection(prompt: Union[Dict[str, List], pd.DataFrame]) -> Union[List, pd.Series]:
global _transformer_model
global _index_embeddings
if _transformer_model is None:
raise ValueError("Injections - transformer model not initialized")
embeddings = _transformer_model.encode(prompt[_prompt])
faiss.normalize_L2(embeddings)
if _index_embeddings is None:
raise ValueError("Injections - index embeddings not initialized")
dists, _ = _index_embeddings.search(x=embeddings, k=1)
return dists.flatten().tolist()
# fix: the udf name was f"{language}{_prompt}.injection", missing the "."
# separator — every other metric in this change uses the
# "<language>.<column>.<metric>" convention.
register_dataset_udf(
    [_prompt],
    udf_name=f"{language}.{_prompt}.injection",
    schema_name=language,
)(injection)


init()
38 changes: 22 additions & 16 deletions langkit/input_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,6 @@
diagnostic_logger = getLogger(__name__)


def init(
transformer_name: Optional[str] = None,
custom_encoder: Optional[Callable] = None,
config: Optional[LangKitConfig] = None,
):
config = config or deepcopy(lang_config)
global _transformer_model
if transformer_name is None and custom_encoder is None:
transformer_name = config.transformer_name
_transformer_model = Encoder(transformer_name, custom_encoder)


init()


@register_dataset_udf([_prompt, _response], f"{_response}.relevance_to_{_prompt}")
def prompt_response_similarity(text):
global _transformer_model

Expand All @@ -53,3 +37,25 @@ def prompt_response_similarity(text):
)
series_result.append(None)
return series_result


def init(
    language: str = "en",  # fix: missing trailing comma caused a SyntaxError
    transformer_name: Optional[str] = None,
    custom_encoder: Optional[Callable] = None,
    config: Optional[LangKitConfig] = None,
):
    """Initialize the prompt/response relevance metric for one language.

    Chooses the encoder (explicit transformer name, a custom encoder, or
    the config's default transformer when neither is given) and registers
    the relevance UDF under the language-scoped schema.

    NOTE(review): prefixing the language into the metric name changes the
    metric name for existing English-only integrations — consider keeping
    the unprefixed name for "en" to preserve backward compatibility.
    """
    config = config or deepcopy(lang_config)
    global _transformer_model
    if transformer_name is None and custom_encoder is None:
        transformer_name = config.transformer_name
    _transformer_model = Encoder(transformer_name, custom_encoder)
    register_dataset_udf(
        [_prompt, _response],
        f"{language}.{_response}.relevance_to_{_prompt}",
        schema_name=language,
    )(prompt_response_similarity)


init()

11 changes: 6 additions & 5 deletions langkit/light_metrics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import List, Optional
from whylogs.experimental.core.udf_schema import udf_schema
from whylogs.core.schema import DeclarativeSchema

Expand All @@ -7,9 +7,10 @@
from langkit import textstat


def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
regexes.init(config=config)
textstat.init(config=config)
def init(languages: List[str] = ["en"], config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
    """Initialize the lightweight (regex + textstat) metrics per language.

    Args:
        languages: Language codes to register metrics for (defaults to English).
        config: Optional LangKitConfig passed through to each module's init.

    Returns:
        A DeclarativeSchema chaining the per-language UDF schemas.
    """
    for language in languages:
        regexes.init(language, config=config)
        # fix: this call was indented one level too deep, out of step with
        # the loop body (flagged in review).
        textstat.init(language, config=config)

    text_schema = udf_schema(chained_schemas=languages)
    return text_schema
19 changes: 10 additions & 9 deletions langkit/llm_metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from . import LangKitConfig
from logging import getLogger
from typing import Optional
from typing import List, Optional
from whylogs.experimental.core.udf_schema import udf_schema
from whylogs.core.schema import DeclarativeSchema

Expand All @@ -19,13 +19,14 @@
)


def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
regexes.init(config=config)
sentiment.init(config=config)
textstat.init(config=config)
themes.init(config=config)
toxicity.init(config=config)
input_output.init(config=config)
def init(languages: List[str] = ["en"], config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
    """Initialize the full LLM metric suite for each requested language.

    Runs each metric module's init once per language, then builds a schema
    chaining the per-language UDF schemas.
    """
    for language in languages:
        for module in (regexes, sentiment, textstat, themes, toxicity, input_output):
            module.init(language, config=config)

    text_schema = udf_schema(chained_schemas=languages)
    return text_schema
14 changes: 9 additions & 5 deletions langkit/nlp_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
_meteor_registered = False


def _register_score_udfs():
def _register_score_udfs(language: str):
global _bleu_registered, _rouge_registered, _meteor_registered

if _corpus:
Expand All @@ -30,7 +30,8 @@ def _register_score_udfs():

@register_dataset_udf(
[response_column],
udf_name=f"{response_column}.bleu_score",
udf_name=f"{language}.{response_column}.bleu_score",
schema_name=language
)
def bleu_score(text):
result = []
Expand All @@ -48,7 +49,8 @@ def bleu_score(text):

@register_dataset_udf(
[response_column],
udf_name=f"{response_column}.rouge_score",
udf_name=f"{language}.{response_column}.rouge_score",
schema_name=language
)
def rouge_score(text):
result = []
Expand All @@ -68,7 +70,8 @@ def rouge_score(text):

@register_dataset_udf(
[response_column],
udf_name=f"{response_column}.meteor_score",
udf_name=f"{language}.{response_column}.meteor_score",
schema_name=language
)
def meteor_score(text):
result = []
Expand All @@ -87,6 +90,7 @@ def meteor_score(text):


def init(
language: str = "en",
corpus: Optional[str] = None,
scores: Set[str] = set(),
rouge_type: str = "",
Expand All @@ -100,7 +104,7 @@ def init(
_scores = list(scores or config.nlp_scores)
_rouge_type = rouge_type or config.rouge_type

_register_score_udfs()
_register_score_udfs(language)


init()
8 changes: 5 additions & 3 deletions langkit/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def wrappee(text):
_registered = False


def _register_udfs():
def _register_udfs(language: str):
global _registered
if _registered:
return
Expand All @@ -48,12 +48,14 @@ def _register_udfs():
for column in [prompt_column, response_column]:
register_dataset_udf(
[column],
udf_name=f"{column}.has_patterns",
udf_name=f"{language}.{column}.has_patterns",
schema_name=language,
metrics=[MetricSpec(FrequentItemsMetric)],
)(_wrapper(column))


def init(
language: str = "en",
pattern_file_path: Optional[str] = None, config: Optional[LangKitConfig] = None
):
config = deepcopy(config or lang_config)
Expand All @@ -64,7 +66,7 @@ def init(
pattern_loader = PatternLoader(config)
pattern_loader.update_patterns()

_register_udfs()
_register_udfs(language)


init()
14 changes: 11 additions & 3 deletions langkit/sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,15 @@ def sentiment_nltk(text: str) -> float:
return _sentiment_analyzer.polarity_scores(text)["compound"]


@register_dataset_udf([_prompt], udf_name=f"{_prompt}.sentiment_nltk")
def prompt_sentiment(text):
    # Compound NLTK sentiment score for every entry in the prompt column.
    return list(map(sentiment_nltk, text[_prompt]))


@register_dataset_udf([_response], udf_name=f"{_response}.sentiment_nltk")
def response_sentiment(text):
    # Compound NLTK sentiment score for every entry in the response column.
    return list(map(sentiment_nltk, text[_response]))


def init(lexicon: Optional[str] = None, config: Optional[LangKitConfig] = None):
def init(language: str = "en", lexicon: Optional[str] = None, config: Optional[LangKitConfig] = None):
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

Expand All @@ -41,6 +39,16 @@ def init(lexicon: Optional[str] = None, config: Optional[LangKitConfig] = None):
_nltk_downloaded = True
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the downloaded lexicon is language-specific — we can't just rename the metric while still downloading the English-based corpus from nltk, right? At a minimum we should perform a check and raise an error or log a warning in the many metrics whose existing models only target English.


_sentiment_analyzer = SentimentIntensityAnalyzer()
register_dataset_udf(
[_prompt],
udf_name=f"{language}.{_prompt}.sentiment_nltk",
schema_name=language
)(prompt_sentiment)
register_dataset_udf(
[_response],
udf_name=f"{language}.{_response}.sentiment_nltk",
schema_name=language
)(response_sentiment)


init()
Loading