Merge pull request #30 from jalammar/hf4-support
Hf4 support
jalammar authored Feb 8, 2021
2 parents 3f6b448 + 2bd80e5 commit e4ad283
Showing 6 changed files with 66 additions and 25 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ numpy~=1.19.1
 ipython~=7.16.1
 scikit-learn~=0.23.2
 seaborn~=0.11.0
-transformers~=3.1.0
+transformers~=4.2.2
 pytest~=6.1.2
 setuptools~=49.6.0
 torch~=1.6.0
4 changes: 2 additions & 2 deletions setup.py
@@ -25,7 +25,7 @@ def read(*names, **kwargs):
 
 setup(
     name='ecco',
-    version='0.0.11',
+    version='0.0.13',
     license='BSD-3-Clause',
     description='Visualization tools for NLP machine learning models.',
     long_description='%s\n%s' % (
@@ -64,7 +64,7 @@ def read(*names, **kwargs):
     ],
     python_requires='!=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
     install_requires=[
-        "transformers < 3.5",
+        "transformers ~= 4.2",
         "seaborn ~= 0.11",
         "scikit-learn~=0.23"
     ],
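As a quick post-upgrade sanity check, a minimal sketch (assuming this release is installed; the expected values follow the pins in this commit):

# Hypothetical sanity check; assumes ecco 0.0.13 from this commit is
# installed and the "transformers ~= 4.2" pin resolved.
import ecco
import transformers

print(ecco.__version__)          # expected: 0.0.13
print(transformers.__version__)  # expected: 4.2.x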
2 changes: 1 addition & 1 deletion src/ecco/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.0.11'
+__version__ = '0.0.13'
 from ecco.lm import LM, MockGPT, MockGPTTokenizer
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
17 changes: 15 additions & 2 deletions src/ecco/lm.py
@@ -10,7 +10,7 @@
 import json
 from ecco.attribution import *
 from typing import Optional, Any
-from transformers.modeling_gpt2 import GPT2Model
+from transformers import GPT2Model
 
 
 def sample_output_token(scores, do_sample, temperature, top_k, top_p):
@@ -131,7 +131,20 @@ def _generate_token(self, input_ids, past, do_sample: bool, temperature: float,
         # detach(): don't need grads here
         # cpu(): not used by GPU during generation; may lead to GPU OOM if left on GPU during long generations
         if getattr(output, "hidden_states", None) is not None:
-            output.hidden_states = tuple([h.cpu().detach() for h in output.hidden_states])
+            hs_list = []
+            for idx, layer_hs in enumerate(output.hidden_states):
+                # In Hugging Face Transformers v4, there's an extra index for batch
+                if len(layer_hs.shape) == 3:  # If there's a batch dimension, pick the first one
+                    hs = layer_hs.cpu().detach()[0].unsqueeze(0)  # Add a dimension to concat along later
+                # Earlier versions are only 2-dimensional.
+                # Note, though: in v4, for GPT2, all layers except the last have 3 dims;
+                # the last layer has only two.
+                else:
+                    hs = layer_hs.cpu().detach().unsqueeze(0)
+
+                hs_list.append(hs)
+
+            output.hidden_states = torch.cat(hs_list, dim=0)
 
         return prediction_id, output
 
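For context, a minimal sketch of the shape difference this hunk handles (assuming transformers ~= 4.2 and a GPT-2 model; not part of the commit):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("Hello world", return_tensors="pt").input_ids
output = model(input_ids, output_hidden_states=True)

# In v4, each element of output.hidden_states is (batch, position, hidden_dim)
print(output.hidden_states[0].shape)  # e.g. torch.Size([1, 2, 768])

# Selecting batch item 0 and re-adding a layer axis, as the hunk does, yields
# one (layer, position, hidden_dim) tensor: embedding layer + 12 blocks for GPT-2
stacked = torch.cat([h[0].unsqueeze(0) for h in output.hidden_states], dim=0)
print(stacked.shape)  # torch.Size([13, 2, 768])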
56 changes: 37 additions & 19 deletions src/ecco/output.py
@@ -22,12 +22,31 @@ def __init__(self,
                  hidden_states=None,
                  attribution=None,
                  activations=None,
-                 activations_type=None,
                  collect_activations_layer_nums=None,
                  attention=None,
                  model_outputs=None,
                  lm_head=None,
                  device='cpu'):
+        """
+        Args:
+            token_ids: List of token ids.
+            n_input_tokens: Int. The number of input tokens in the sequence.
+            tokenizer: Hugging Face tokenizer associated with the model generating this output.
+            output_text: The output text generated by the model (if processed with generate()).
+            tokens: A list of token texts. Shorthand for passing the token ids through the tokenizer.
+            hidden_states: A tensor of dimensions (layer, position, hidden_dimension).
+                In the layer dimension, index 0 is the embedding hidden state.
+            attribution: A list of attributions, one element per generated token.
+                Each element is a list giving a value for every token from 0 to right before the generated token.
+            activations: The activations collected from model processing.
+                Shape is (batch, layer, neurons, position).
+            collect_activations_layer_nums: Layer numbers for which activations were collected.
+            attention: The attention tensor retrieved from the language model.
+            model_outputs: Raw return object returned by the model.
+            lm_head: The trained language model head from a language model, projecting a
+                hidden state to the output vocabulary associated with the tokenizer.
+            device: "cuda" or "cpu".
+        """
         self.token_ids = token_ids
         self.tokenizer = tokenizer
         self.n_input_tokens = n_input_tokens
@@ -36,7 +55,6 @@ def __init__(self,
         self.hidden_states = hidden_states
         self.attribution = attribution
         self.activations = activations
-        self.activations_type = activations_type
         self.collect_activations_layer_nums = collect_activations_layer_nums
         self.model_outputs = model_outputs
         self.attention_values = attention
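To make the documented shapes concrete, a hypothetical construction of an OutputSeq (argument values are illustrative only, not from the commit):

# Hypothetical OutputSeq instantiation following the documented shapes;
# token ids/texts and tensor sizes are made up for illustration (GPT-2 sizes).
import torch
from ecco.output import OutputSeq

out = OutputSeq(
    token_ids=[15496, 995, 0],
    n_input_tokens=2,
    tokens=['Hello', ' world', '!'],
    # (layer, position, hidden_dimension); layer index 0 is the embedding
    hidden_states=torch.rand(13, 3, 768),
    device='cpu',
)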
@@ -227,7 +245,7 @@ def plot_feature_importance_barplots(self):
             # print(i.numpy())
             plt.show()
 
-    def layer_predictions(self, position: int = 0, topk: Optional[int] = 10, layer: Optional[int] = None, **kwargs):
+    def layer_predictions(self, position: int = 1, topk: Optional[int] = 10, layer: Optional[int] = None, **kwargs):
         """
         Visualization plotting the topk predicted tokens after each layer (using its hidden state).
         :param output: OutputSeq object generated by LM.generate()
@@ -236,32 +254,37 @@ def layer_predictions(self, position: int = 0, topk: Optional[int] = 10, layer:
         :param layer: None shows all layers. Can also pass an int with the layer id to show only that layer
         """
 
-        watch = self.to(torch.tensor([self.token_ids[self.n_input_tokens]]))
+        hidden_states = self.hidden_states
+
+        if position == 0:
+            raise ValueError(f"'position' is set to 0. There is never a hidden state associated with this position. "
+                             f"Possible values are 1 and above -- the position of the token of interest in the sequence")
+        # watch = self.to(torch.tensor([self.token_ids[self.n_input_tokens]]))
+        # There is one lm output per generated token. To get the index:
+        output_index = position - self.n_input_tokens
         if layer is not None:
-            hidden_states = self.hidden_states[layer + 1].unsqueeze(0)
+            # If a layer is specified, use only that layer.
+            hidden_states = hidden_states[layer + 1].unsqueeze(0)
         else:
-            hidden_states = self.hidden_states[1:]  # Ignore the first element (embedding)
+            # Include all layers except the first (the embedding)
+            hidden_states = hidden_states[1:]
 
         k = topk
         top_tokens = []
         probs = []
         data = []
 
+        print('Predictions for position {}'.format(position))
         for layer_no, h in enumerate(hidden_states):
             # print(h.shape)
             hidden_state = h[position - 1]
             # Use lm_head to project the layer's hidden state to the output vocabulary
             logits = self.lm_head(self.to(hidden_state))
             softmax = F.softmax(logits, dim=-1)
             # softmax dims are (number of words in vocab) - 50257 in GPT2
             sorted_softmax = self.to(torch.argsort(softmax))
 
-            layer_top_tokens = [self.tokenizer.decode([t]) for t in sorted_softmax[-k:]][::-1]
+            # Not currently used. If we're "watching" a specific token, this gets its ranking:
+            # idx = sorted_softmax.shape[0] - torch.nonzero((sorted_softmax == watch)).flatten()
+
+            layer_top_tokens = [self.tokenizer.decode(t) for t in sorted_softmax[-k:]][::-1]
             top_tokens.append(layer_top_tokens)
             layer_probs = softmax[sorted_softmax[-k:]].cpu().detach().numpy()[::-1]
             probs.append(layer_probs.tolist())
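A standalone sketch of what this loop computes per layer (hypothetical helper, not part of the commit; assumes a 1-D hidden_state tensor and the model's lm_head module):

# Hypothetical helper mirroring the projection step above; assumes
# `hidden_state` is a (hidden_dim,) tensor and `lm_head` maps it to vocab logits.
import torch
import torch.nn.functional as F

def topk_tokens_for_layer(hidden_state, lm_head, tokenizer, k=10):
    logits = lm_head(hidden_state)
    softmax = F.softmax(logits, dim=-1)   # one probability per vocab entry
    sorted_ids = torch.argsort(softmax)   # ascending by probability
    top_ids = sorted_ids[-k:].flip(0)     # highest probability first
    tokens = [tokenizer.decode(t) for t in top_ids]
    probs = softmax[top_ids].detach().cpu().numpy()
    return list(zip(tokens, probs.tolist()))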
@@ -306,11 +329,11 @@ def rankings(self, **kwargs):
         Plots the rankings (across layers) of the tokens the model selected.
         Each column is a position in the sequence. Each row is a layer.
         """
+
         hidden_states = self.hidden_states
 
         n_layers = len(hidden_states)
         position = hidden_states[0].shape[0] - self.n_input_tokens + 1
-        # print('position', position)
 
         predicted_tokens = np.empty((n_layers - 1, position), dtype='U25')
         rankings = np.zeros((n_layers - 1, position), dtype=np.int32)
@@ -320,32 +343,27 @@
         for i, level in enumerate(hidden_states[1:]):
             # Loop through generated/output positions
             for j, hidden_state in enumerate(level[self.n_input_tokens - 1:]):
-                # print('hidden state layer', i, 'position', self.n_input_tokens-1+j)
                 # Project hidden state to vocabulary
+                # (after debugging pain: ensure input is on GPU, if appropriate)
                 logits = self.lm_head(self.to(hidden_state))
-                # logits = self.lm_head(torch.tensor(hidden_state))
                 # Sort by score (ascending)
                 sorted = torch.argsort(logits)
                 # What token was sampled in this position?
 
                 token_id = torch.tensor(self.token_ids[self.n_input_tokens + j])
-                # print('token_id', token_id)
-                # token_id = self.token_ids.clone().detach()[self.n_input_tokens + j]
                 # What's the index of the sampled token in the sorted list?
                 r = torch.nonzero((sorted == token_id)).flatten()
                 # Subtract to get the ranking (1 is the top score, because sorting was ascending)
                 ranking = sorted.shape[0] - r
-                # print('ranking', ranking)
-                # token_id = torch.argmax(sm)
                 token = self.tokenizer.decode([token_id])
                 predicted_tokens[i, j] = token
                 rankings[i, j] = int(ranking)
-                # print('layer', i, 'position', j, 'top1', token_id, 'actual label', output['token_ids'][j]+1)
                 if token_id == self.token_ids[j + 1]:
                     token_found_mask[i, j] = 0
 
         input_tokens = [repr(t) for t in self.tokens[self.n_input_tokens - 1:-1]]
         output_tokens = [repr(t) for t in self.tokens[self.n_input_tokens:]]
-        # print('in out', input_tokens, output_tokens)
         lm_plots.plot_inner_token_rankings(input_tokens,
                                            output_tokens,
                                            rankings,
@@ -367,12 +385,12 @@ def rankings_watch(self, watch: List[int] = None, position: int = -1, **kwargs):
         if position != -1:
             position = position - 1  # e.g. position 5 corresponds to activation 4
 
         hidden_states = self.hidden_states
 
         n_layers = len(hidden_states)
         n_tokens_to_watch = len(watch)
 
-        predicted_tokens = np.empty((n_layers - 1, n_tokens_to_watch), dtype='U25')
+        # predicted_tokens = np.empty((n_layers - 1, n_tokens_to_watch), dtype='U25')
         rankings = np.zeros((n_layers - 1, n_tokens_to_watch), dtype=np.int32)
 
         # loop through layer levels
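A minimal sketch of the per-layer ranking computation in rankings() (hypothetical standalone helper; assumes a 1-D logits tensor and the sampled token's id):

# Hypothetical helper mirroring the ranking computation in rankings();
# assumes `logits` is a (vocab_size,) tensor and `token_id` the sampled token.
import torch

def rank_of_token(logits, token_id):
    sorted_ids = torch.argsort(logits)                   # ascending by score
    r = torch.nonzero(sorted_ids == token_id).flatten()  # index in sorted order
    return int(logits.shape[0] - r)                      # 1 = top-ranked token

# Example: the highest-scoring token has rank 1
logits = torch.tensor([0.1, 2.5, 0.7])
print(rank_of_token(logits, torch.tensor(1)))  # 1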
10 changes: 10 additions & 0 deletions tests/output_test.py
@@ -30,6 +30,11 @@ def test_saliency(self, output_seq_1):
 
         assert actual == expected
 
+
+    def test_layer_position_zero_raises_valueerror(self, output_seq_1):
+        with pytest.raises(ValueError, match=r".* set to 0*") as ex:
+            actual = output_seq_1.layer_predictions(position=0)
+
     def test_layer_predictions_all_layers(self, output_seq_1):
         actual = output_seq_1.layer_predictions(printJson=True)
         assert len(actual) == 6  # an array for each layer
@@ -79,6 +84,11 @@ def test_nmf_raises_value_error_layer_bounds(self):
                                  from_layer=1,
                                  to_layer=0)
 
+    # def test_rankings_watch_success_1(self, output_seq_1):
+    #     actual = output_seq_1.rankings_watch(watch=[0,1], printJson=True)
+    #     print(actual)
+    #     assert False
+
 
 @pytest.fixture
 def output_seq_1():
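To exercise just the new test, a hypothetical invocation from Python (equivalent to running pytest with -k on the command line):

# Hypothetical way to run only the new test; assumes execution from the
# repository root where tests/output_test.py lives.
import pytest

pytest.main(["tests/output_test.py", "-k", "layer_position_zero"])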
