From e8344163bffed157c7775bab5e8a3ab462e937a2 Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sat, 30 Jan 2021 08:50:33 +0300 Subject: [PATCH 1/8] - Adding support for huggingface transformers version 4 - Added assertion and test for layer_predictions() position 0 --- requirements.txt | 2 +- setup.py | 4 ++-- src/ecco/__init__.py | 2 +- src/ecco/lm.py | 2 +- src/ecco/output.py | 5 ++++- tests/output_test.py | 5 +++++ 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 80ad44a..28f4bd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ numpy~=1.19.1 ipython~=7.16.1 scikit-learn~=0.23.2 seaborn~=0.11.0 -transformers~=3.1.0 +transformers~=4.2.2 pytest~=6.1.2 setuptools~=49.6.0 torch~=1.6.0 diff --git a/setup.py b/setup.py index e46dfc2..b1471f5 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def read(*names, **kwargs): setup( name='ecco', - version='0.0.11', + version='0.0.13', license='BSD-3-Clause', description='Visualization tools for NLP machine learning models.', long_description='%s\n%s' % ( @@ -64,7 +64,7 @@ def read(*names, **kwargs): ], python_requires='!=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*', install_requires=[ - "transformers < 3.5", + "transformers ~= 4", "seaborn ~= 0.11", "scikit-learn~=0.23" ], diff --git a/src/ecco/__init__.py b/src/ecco/__init__.py index 63543e0..ee92e25 100644 --- a/src/ecco/__init__.py +++ b/src/ecco/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.0.11' +__version__ = '0.0.13' from ecco.lm import LM, MockGPT, MockGPTTokenizer from transformers import AutoTokenizer, AutoModelForCausalLM diff --git a/src/ecco/lm.py b/src/ecco/lm.py index ff0162d..0e068b5 100644 --- a/src/ecco/lm.py +++ b/src/ecco/lm.py @@ -10,7 +10,7 @@ import json from ecco.attribution import * from typing import Optional, Any -from transformers.modeling_gpt2 import GPT2Model +from transformers import GPT2Model def sample_output_token(scores, do_sample, temperature, top_k, top_p): diff 
--git a/src/ecco/output.py b/src/ecco/output.py index 58c3012..c1a8cb8 100644 --- a/src/ecco/output.py +++ b/src/ecco/output.py @@ -227,7 +227,7 @@ def plot_feature_importance_barplots(self): # print(i.numpy()) plt.show() - def layer_predictions(self, position: int = 0, topk: Optional[int] = 10, layer: Optional[int] = None, **kwargs): + def layer_predictions(self, position: int = 1, topk: Optional[int] = 10, layer: Optional[int] = None, **kwargs): """ Visualization plotting the topk predicted tokens after each layer (using its hidden state). :param output: OutputSeq object generated by LM.generate() @@ -236,6 +236,9 @@ def layer_predictions(self, position: int = 0, topk: Optional[int] = 10, layer: :param layer: None shows all layers. Can also pass an int with the layer id to show only that layer """ + if position == 0: + raise ValueError(f"'position' is set to 0. There is never a hidden state associated with this position." + f"Possible values are 1 and above -- the position of the token of interest in the sequence") watch = self.to(torch.tensor([self.token_ids[self.n_input_tokens]])) # There is one lm output per generated token. 
To get the index output_index = position - self.n_input_tokens diff --git a/tests/output_test.py b/tests/output_test.py index 04b8d9e..2526742 100644 --- a/tests/output_test.py +++ b/tests/output_test.py @@ -30,6 +30,11 @@ def test_saliency(self, output_seq_1): assert actual == expected + + def test_layer_position_zero_raises_valueerror(self, output_seq_1): + with pytest.raises(ValueError, match=r".* set to 0*") as ex: + actual = output_seq_1.layer_predictions(position=0) + def test_layer_predictions_all_layers(self, output_seq_1): actual = output_seq_1.layer_predictions(printJson=True) assert len(actual) == 6 # an array for each layer From db02a4df9c927124d72a9fafe963fb97db00fa70 Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sat, 30 Jan 2021 08:55:06 +0300 Subject: [PATCH 2/8] - Adjusted version numbers --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 28f4bd2..abb5e88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,6 @@ seaborn~=0.11.0 transformers~=4.2.2 pytest~=6.1.2 setuptools~=49.6.0 -torch~=1.6.0 -torchvision~=0.7.0 +torch~=1.7.1 +torchvision~=0.8.2 From de8cee40d074cb90b86046dbf2f1a744986b3983 Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sat, 30 Jan 2021 09:00:18 +0300 Subject: [PATCH 3/8] - Adjusted version numbers --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b1471f5..034255b 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ def read(*names, **kwargs): ], python_requires='!=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*', install_requires=[ - "transformers ~= 4", + "transformers ~= 4.2", "seaborn ~= 0.11", "scikit-learn~=0.23" ], From d3c60dadd064daaf8c4559cb6867423fb643ba8a Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sat, 30 Jan 2021 09:12:51 +0300 Subject: [PATCH 4/8] - Adjusted version numbers --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/requirements.txt b/requirements.txt index abb5e88..28f4bd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,6 @@ seaborn~=0.11.0 transformers~=4.2.2 pytest~=6.1.2 setuptools~=49.6.0 -torch~=1.7.1 -torchvision~=0.8.2 +torch~=1.6.0 +torchvision~=0.7.0 From 6db39e10b18a4a16bf17cf1274b7812311d3f6f8 Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sat, 30 Jan 2021 09:49:15 +0300 Subject: [PATCH 5/8] - To support huggingface transformers 4, add support for the additional dimension added to hiddenstates. --- src/ecco/output.py | 25 ++++++++++++++++++++----- tests/output_test.py | 5 +++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/ecco/output.py b/src/ecco/output.py index c1a8cb8..fa88ffa 100644 --- a/src/ecco/output.py +++ b/src/ecco/output.py @@ -235,6 +235,12 @@ def layer_predictions(self, position: int = 1, topk: Optional[int] = 10, layer: :param topk: Number of tokens to show for each layer :param layer: None shows all layers. Can also pass an int with the layer id to show only that layer """ + # Starting from huggingface transformers v4, the shape + # 3 dimensional instead of 2 + if self.hidden_states.shape == 3: + hidden_states = self.hidden_states[0] + else: # To support huggingface transformers v. 3 + hidden_states = self.hidden_states if position == 0: raise ValueError(f"'position' is set to 0. There is never a hidden state associated with this position." @@ -243,9 +249,9 @@ def layer_predictions(self, position: int = 1, topk: Optional[int] = 10, layer: # There is one lm output per generated token. 
To get the index output_index = position - self.n_input_tokens if layer is not None: - hidden_states = self.hidden_states[layer + 1].unsqueeze(0) + hidden_states = hidden_states[layer + 1].unsqueeze(0) else: - hidden_states = self.hidden_states[1:] # Ignore the first element (embedding) + hidden_states = hidden_states[1:] # Ignore the first element (embedding) k = topk top_tokens = [] @@ -309,7 +315,12 @@ def rankings(self, **kwargs): Plots the rankings (across layers) of the tokens the model selected. Each column is a position in the sequence. Each row is a layer. """ - hidden_states = self.hidden_states + # Starting from huggingface transformers v4, the shape + # 3 dimensional instead of 2 + if self.hidden_states.shape == 3: + hidden_states = self.hidden_states[0] + else: # To support huggingface transformers v. 3 + hidden_states = self.hidden_states n_layers = len(hidden_states) position = hidden_states[0].shape[0] - self.n_input_tokens + 1 @@ -370,12 +381,16 @@ def rankings_watch(self, watch: List[int] = None, position: int = -1, **kwargs): if position != -1: position = position - 1 # e.g. position 5 corresponds to activation 4 - hidden_states = self.hidden_states + # Starting from huggingface transformers v4, the shape + # 3 dimensional instead of 2 + if self.hidden_states.shape == 3: + hidden_states = self.hidden_states[0] + else: # To support huggingface transformers v. 
3 + hidden_states = self.hidden_states n_layers = len(hidden_states) n_tokens_to_watch = len(watch) - # predicted_tokens = np.empty((n_layers - 1, n_tokens_to_watch), dtype='U25') rankings = np.zeros((n_layers - 1, n_tokens_to_watch), dtype=np.int32) # loop through layer levels diff --git a/tests/output_test.py b/tests/output_test.py index 2526742..f5efa3c 100644 --- a/tests/output_test.py +++ b/tests/output_test.py @@ -84,6 +84,11 @@ def test_nmf_raises_value_error_layer_bounds(self): from_layer=1, to_layer=0) + # def test_rankings_watch_success_1(self, output_seq_1): + # actual = output_seq_1.rankings_watch(watch=[0,1], printJson=True) + # print(actual) + # assert False + @pytest.fixture def output_seq_1(): From 17f9c9214d23c0e84bec3c8b23fb037f647813aa Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sat, 30 Jan 2021 10:05:51 +0300 Subject: [PATCH 6/8] - Rolled back premature hotfix. Will dig deeper later. Just some tensor plumbing needed for the hidden_states object --- src/ecco/output.py | 47 ++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/ecco/output.py b/src/ecco/output.py index fa88ffa..b9c9c92 100644 --- a/src/ecco/output.py +++ b/src/ecco/output.py @@ -22,12 +22,32 @@ def __init__(self, hidden_states=None, attribution=None, activations=None, - activations_type=None, collect_activations_layer_nums=None, attention=None, model_outputs=None, lm_head=None, device='cpu'): + """ + Args: + token_ids: List of token ids + n_input_tokens: Int. The number of input tokens in the sequence. + tokenizer: huggingface tokenizer associated with the model generating this output + output_text: The output text generated by the model (if processed with generate()) + tokens: A list of token text. Shorthand to passing the token ids by the tokenizer + hidden_states: A list of with "laeyers + 1" elements.
Each element + has dimsions (batch, position, hidden_dimension) if huggingface transformers v4 + or (position, hidden_dimension) if v3 + attribution: A list of attributions. One element per generated token. + Each element is a list giving a value for tokens from 0 to right before the generated token. + activations: The activations collected from model processing. + Shape is (batch, layer, neurons, position) + collect_activations_layer_nums: + attention: The attention tensor retrieved from the language model + model_outputs: Raw return object returned by the model + lm_head: The trained language model head from a language model projecting a + hidden state to an output vocabulary associated with teh tokenizer. + device: "cuda" or "cpu" + """ self.token_ids = token_ids self.tokenizer = tokenizer self.n_input_tokens = n_input_tokens @@ -36,7 +56,6 @@ def __init__(self, self.hidden_states = hidden_states self.attribution = attribution self.activations = activations - self.activations_type = activations_type self.collect_activations_layer_nums = collect_activations_layer_nums self.model_outputs = model_outputs self.attention_values = attention @@ -235,12 +254,8 @@ def layer_predictions(self, position: int = 1, topk: Optional[int] = 10, layer: :param topk: Number of tokens to show for each layer :param layer: None shows all layers. Can also pass an int with the layer id to show only that layer """ - # Starting from huggingface transformers v4, the shape - # 3 dimensional instead of 2 - if self.hidden_states.shape == 3: - hidden_states = self.hidden_states[0] - else: # To support huggingface transformers v. 3 - hidden_states = self.hidden_states + + hidden_states = self.hidden_states if position == 0: raise ValueError(f"'position' is set to 0. There is never a hidden state associated with this position." @@ -315,12 +330,8 @@ def rankings(self, **kwargs): Plots the rankings (across layers) of the tokens the model selected. Each column is a position in the sequence. 
Each row is a layer. """ - # Starting from huggingface transformers v4, the shape - # 3 dimensional instead of 2 - if self.hidden_states.shape == 3: - hidden_states = self.hidden_states[0] - else: # To support huggingface transformers v. 3 - hidden_states = self.hidden_states + + hidden_states = self.hidden_states n_layers = len(hidden_states) position = hidden_states[0].shape[0] - self.n_input_tokens + 1 @@ -381,12 +392,8 @@ def rankings_watch(self, watch: List[int] = None, position: int = -1, **kwargs): if position != -1: position = position - 1 # e.g. position 5 corresponds to activation 4 - # Starting from huggingface transformers v4, the shape - # 3 dimensional instead of 2 - if self.hidden_states.shape == 3: - hidden_states = self.hidden_states[0] - else: # To support huggingface transformers v. 3 - hidden_states = self.hidden_states + + hidden_states = self.hidden_states n_layers = len(hidden_states) n_tokens_to_watch = len(watch) From cad51f33f8251d300864d42debb9453884ce80b7 Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sun, 7 Feb 2021 14:36:29 +0300 Subject: [PATCH 7/8] - LM: Collecting hidden states in LM is now able to deal with hidden states with a batch dimension. It just selects the first element in the batch. - LM: hidden_states passed to OutputSeq is now a tensor instead of a tuple. Much easier to deal with. 
Shape: (layer, position, d_model) --- src/ecco/lm.py | 15 ++++++++++++++- src/ecco/output.py | 27 +++++++++------------------ 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/ecco/lm.py b/src/ecco/lm.py index 0e068b5..fc5df47 100644 --- a/src/ecco/lm.py +++ b/src/ecco/lm.py @@ -131,7 +131,20 @@ def _generate_token(self, input_ids, past, do_sample: bool, temperature: float, # detach(): don't need grads here # cpu(): not used by GPU during generation; may lead to GPU OOM if left on GPU during long generations if getattr(output, "hidden_states", None) is not None: - output.hidden_states = tuple([h.cpu().detach() for h in output.hidden_states]) + hs_list = [] + for idx, layer_hs in enumerate(output.hidden_states): + # in Hugging Face Transformers v4, there's an extra index for batch + if len(layer_hs.shape) == 3: # If there's a batch dimension, pick the first one + hs = layer_hs.cpu().detach()[0].unsqueeze(0) # Adding a dimension to concat to later + # Earlier versions are only 2 dimensional + # But also, in v4, for GPT2, all except the last one would have 3 dims, the last layer + # would only have two dims + else: + hs = layer_hs.cpu().detach().unsqueeze(0) + + hs_list.append(hs) + + output.hidden_states = torch.cat(hs_list, dim=0) return prediction_id, output diff --git a/src/ecco/output.py b/src/ecco/output.py index b9c9c92..7d032ed 100644 --- a/src/ecco/output.py +++ b/src/ecco/output.py @@ -34,9 +34,8 @@ def __init__(self, tokenizer: huggingface tokenizer associated with the model generating this output output_text: The output text generated by the model (if processed with generate()) tokens: A list of token text. Shorthand to passing the token ids by the tokenizer - hidden_states: A list of with "laeyers + 1" elements. Each element - has dimsions (batch, position, hidden_dimension) if huggingface transformers v4 - or (position, hidden_dimension) if v3 + hidden_states: A tensor of dimensions (layer, position, hidden_dimension).
+ In layer, index 0 is for embedding hidden_state. attribution: A list of attributions. One element per generated token. Each element is a list giving a value for tokens from 0 to right before the generated token. activations: The activations collected from model processing. @@ -260,32 +259,32 @@ def layer_predictions(self, position: int = 1, topk: Optional[int] = 10, layer: if position == 0: raise ValueError(f"'position' is set to 0. There is never a hidden state associated with this position." f"Possible values are 1 and above -- the position of the token of interest in the sequence") - watch = self.to(torch.tensor([self.token_ids[self.n_input_tokens]])) + # watch = self.to(torch.tensor([self.token_ids[self.n_input_tokens]])) # There is one lm output per generated token. To get the index output_index = position - self.n_input_tokens if layer is not None: + # If a layer is specified, choose it only. hidden_states = hidden_states[layer + 1].unsqueeze(0) else: - hidden_states = hidden_states[1:] # Ignore the first element (embedding) + # include all layers except the first + hidden_states = hidden_states[1:] k = topk top_tokens = [] probs = [] data = [] - print('Predictions for position {}'.format(position)) for layer_no, h in enumerate(hidden_states): - # print(h.shape) hidden_state = h[position - 1] # Use lm_head to project the layer's hidden state to output vocabulary logits = self.lm_head(self.to(hidden_state)) softmax = F.softmax(logits, dim=-1) + # softmax dims are (number of words in vocab) - 50257 in GPT2 sorted_softmax = self.to(torch.argsort(softmax)) - # Not currently used. 
If we're "watching" a specific token, this gets its ranking # idx = sorted_softmax.shape[0] - torch.nonzero((sorted_softmax == watch)).flatten() - layer_top_tokens = [self.tokenizer.decode([t]) for t in sorted_softmax[-k:]][::-1] + layer_top_tokens = [self.tokenizer.decode(t) for t in sorted_softmax[-k:]][::-1] top_tokens.append(layer_top_tokens) layer_probs = softmax[sorted_softmax[-k:]].cpu().detach().numpy()[::-1] probs.append(layer_probs.tolist()) @@ -335,7 +334,6 @@ def rankings(self, **kwargs): n_layers = len(hidden_states) position = hidden_states[0].shape[0] - self.n_input_tokens + 1 - # print('position', position) predicted_tokens = np.empty((n_layers - 1, position), dtype='U25') rankings = np.zeros((n_layers - 1, position), dtype=np.int32) @@ -345,32 +343,25 @@ def rankings(self, **kwargs): for i, level in enumerate(hidden_states[1:]): # Loop through generated/output positions for j, hidden_state in enumerate(level[self.n_input_tokens - 1:]): - # print('hidden state layer', i, 'position', self.n_input_tokens-1+j) # Project hidden state to vocabulary # (after debugging pain: ensure input is on GPU, if appropriate) logits = self.lm_head(self.to(hidden_state)) - # logits = self.lm_head(torch.tensor(hidden_state)) # Sort by score (ascending) sorted = torch.argsort(logits) # What token was sampled in this position? - token_id = torch.tensor(self.token_ids[self.n_input_tokens + j]) - # print('token_id', token_id) + token_id = self.token_ids[self.n_input_tokens + j].clone().detach() # What's the index of the sampled token in the sorted list? 
r = torch.nonzero((sorted == token_id)).flatten() # subtract to get ranking (where 1 is the top scoring, because sorting was in ascending order) ranking = sorted.shape[0] - r - # print('ranking', ranking) - # token_id = torch.argmax(sm) token = self.tokenizer.decode([token_id]) predicted_tokens[i, j] = token rankings[i, j] = int(ranking) - # print('layer', i, 'position', j, 'top1', token_id, 'actual label', output['token_ids'][j]+1) if token_id == self.token_ids[j + 1]: token_found_mask[i, j] = 0 input_tokens = [repr(t) for t in self.tokens[self.n_input_tokens - 1:-1]] output_tokens = [repr(t) for t in self.tokens[self.n_input_tokens:]] - # print('in out', input_tokens, output_tokens) lm_plots.plot_inner_token_rankings(input_tokens, output_tokens, rankings, From 2bd80e5a64b60aeecec690577df2de40dfeab14a Mon Sep 17 00:00:00 2001 From: Jay Alammar Date: Sun, 7 Feb 2021 14:44:19 +0300 Subject: [PATCH 8/8] - --- src/ecco/output.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ecco/output.py b/src/ecco/output.py index 7d032ed..c1d5ae1 100644 --- a/src/ecco/output.py +++ b/src/ecco/output.py @@ -349,7 +349,9 @@ def rankings(self, **kwargs): # Sort by score (ascending) sorted = torch.argsort(logits) # What token was sampled in this position? - token_id = self.token_ids[self.n_input_tokens + j].clone().detach() + + token_id = torch.tensor(self.token_ids[self.n_input_tokens + j]) + # token_id = self.token_ids.clone().detach()[self.n_input_tokens + j] # What's the index of the sampled token in the sorted list? r = torch.nonzero((sorted == token_id)).flatten() # subtract to get ranking (where 1 is the top scoring, because sorting was in ascending order)