experimental multilingual idea #171
base: main
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import List, Optional
 from whylogs.experimental.core.udf_schema import udf_schema
 from whylogs.core.schema import DeclarativeSchema
@@ -13,14 +13,15 @@
 from langkit import input_output
-def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
-    injections.init(config=config)
-    topics.init(config=config)
-    regexes.init(config=config)
-    sentiment.init(config=config)
-    textstat.init(config=config)
-    themes.init(config=config)
-    toxicity.init(config=config)
-    input_output.init(config=config)
-    text_schema = udf_schema()
+def init(languages: List[str] = ["en"], config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
+    for language in langauges:

Review comment: typo here? "langauges"

+        injections.init(language, config=config)
+        topics.init(language, config=config)
+        regexes.init(language, config=config)
+        sentiment.init(language, config=config)
+        textstat.init(language, config=config)
+        themes.init(language, config=config)
+        toxicity.init(language, config=config)
+        input_output.init(language, config=config)
+    text_schema = udf_schema(chained_schemas=languages)
     return text_schema
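A minimal usage sketch of the API this first file proposes. The module name (llm_metrics) and the whylogs call pattern are assumptions; neither appears in this diff.

# Hypothetical usage of the multilingual init proposed above.
# The module name (llm_metrics) and the whylogs call are assumptions.
import whylogs as why
from langkit import llm_metrics

# Build one schema that chains the per-language sub-schemas together.
schema = llm_metrics.init(languages=["en", "fr"])

# Profile a prompt/response pair; under this proposal each registered UDF
# would run once per requested language.
profile = why.log({"prompt": "Hello", "response": "Bonjour"}, schema=schema).profile()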
Next file in the diff:
@@ -16,22 +16,6 @@
 diagnostic_logger = getLogger(__name__)
-def init(
-    transformer_name: Optional[str] = None,
-    custom_encoder: Optional[Callable] = None,
-    config: Optional[LangKitConfig] = None,
-):
-    config = config or deepcopy(lang_config)
-    global _transformer_model
-    if transformer_name is None and custom_encoder is None:
-        transformer_name = config.transformer_name
-    _transformer_model = Encoder(transformer_name, custom_encoder)
-
-
-init()
-
-
-@register_dataset_udf([_prompt, _response], f"{_response}.relevance_to_{_prompt}")
 def prompt_response_similarity(text):
     global _transformer_model
@@ -53,3 +37,25 @@ def prompt_response_similarity(text):
     )
     series_result.append(None)
     return series_result
+
+
+def init(
+    language: str = "en",
+    transformer_name: Optional[str] = None,
+    custom_encoder: Optional[Callable] = None,
+    config: Optional[LangKitConfig] = None,
+):
+    config = config or deepcopy(lang_config)
+    global _transformer_model
+    if transformer_name is None and custom_encoder is None:
+        transformer_name = config.transformer_name
+    _transformer_model = Encoder(transformer_name, custom_encoder)
+    register_dataset_udf(
+        [_prompt, _response],
+        f"{language}.{_response}.relevance_to_{_prompt}",
Review comment: this renaming, prefixing the language onto the metric name, will create a discontinuity with existing integrations and break back-compat. We shouldn't prefix the localization in the metric name, at least not for the original English-only launch of LangKit. Better would be to put this in metadata, or in the platform, something like a column entity schema?

Review comment: Do you want, for example, to track English and French toxicity in the same column?

Review comment: maybe we could keep the original name for English, and add the language prefix only for other languages?

(A sketch of that last suggestion follows this file's diff.)
+        schema_name=language
+    )(prompt_response_similarity)
+
+
+init()
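A minimal sketch of the naming scheme suggested in the comment thread above: keep the legacy metric name for English and prefix only other languages. This illustrates the review suggestion, not code from the PR; the helper name is hypothetical.

# Hypothetical helper illustrating the reviewers' back-compat suggestion:
# English keeps the existing metric name, other languages get a prefix.
def _relevance_metric_name(language: str) -> str:
    base = f"{_response}.relevance_to_{_prompt}"
    return base if language == "en" else f"{language}.{base}"

# init() could then register the UDF under the back-compatible name:
# register_dataset_udf(
#     [_prompt, _response],
#     _relevance_metric_name(language),
#     schema_name=language,
# )(prompt_response_similarity)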
Next file in the diff:
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import List, Optional
 from whylogs.experimental.core.udf_schema import udf_schema
 from whylogs.core.schema import DeclarativeSchema
@@ -7,9 +7,10 @@
 from langkit import textstat


-def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
-    regexes.init(config=config)
-    textstat.init(config=config)
+def init(languages: List[str] = ["en"], config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
+    for language in languages:
+        regexes.init(language, config=config)
Review comment: looks like indentation is wrong here
+    textstat.init(language, config=config)

-    text_schema = udf_schema()
+    text_schema = udf_schema(chained_schemas=languages)
     return text_schema
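A guess at what the indentation comment above is pointing at: presumably textstat.init is meant to run once per language as well, i.e. inside the loop. This is an assumption, not something stated in the thread.

# Presumed intent, with textstat.init moved inside the loop (assumption):
def init(languages: List[str] = ["en"], config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
    for language in languages:
        regexes.init(language, config=config)
        textstat.init(language, config=config)
    text_schema = udf_schema(chained_schemas=languages)
    return text_schema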
Next file in the diff:
@@ -19,17 +19,15 @@ def sentiment_nltk(text: str) -> float:
     return _sentiment_analyzer.polarity_scores(text)["compound"]
-@register_dataset_udf([_prompt], udf_name=f"{_prompt}.sentiment_nltk")
 def prompt_sentiment(text):
     return [sentiment_nltk(t) for t in text[_prompt]]


-@register_dataset_udf([_response], udf_name=f"{_response}.sentiment_nltk")
 def response_sentiment(text):
     return [sentiment_nltk(t) for t in text[_response]]


-def init(lexicon: Optional[str] = None, config: Optional[LangKitConfig] = None):
+def init(language: str = "en", lexicon: Optional[str] = None, config: Optional[LangKitConfig] = None):
     import nltk
     from nltk.sentiment import SentimentIntensityAnalyzer
@@ -41,6 +39,16 @@ def init(lexicon: Optional[str] = None, config: Optional[LangKitConfig] = None):
         _nltk_downloaded = True

Review comment (truncated in the source): The ...

     _sentiment_analyzer = SentimentIntensityAnalyzer()
+    register_dataset_udf(
+        [_prompt],
+        udf_name=f"{language}.{_prompt}.sentiment_nltk",
+        schema_name=language
+    )(prompt_sentiment)
+    register_dataset_udf(
+        [_response],
+        udf_name=f"{language}.{_response}.sentiment_nltk",
+        schema_name=language
+    )(response_sentiment)


 init()
Review comment: Considering that the modules are imported before init is called with the desired languages, does that mean that English will always be applied, and the others will be additional language-specific metrics?
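One reading of the import-time behaviour that question refers to. This is an interpretation of the diff, not something the PR states.

# Assumed behaviour: importing the module runs init() with the default
# language, so the "en" metrics are always registered up front.
from langkit import sentiment   # registers en.prompt.sentiment_nltk, en.response.sentiment_nltk

# A later call with another language would add further, prefixed UDFs
# alongside the defaults rather than replacing them:
sentiment.init(language="fr")   # adds fr.prompt.sentiment_nltk, fr.response.sentiment_nltk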