# attention_utils.py

"""Collection of utilities for attention intervention"""

import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
import torch

from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind


def perform_intervention(intervention, model, effect_types=('indirect', 'direct')):
	"""Run a single intervention through the model and collect its effects.

	Args:
		intervention: object exposing ``base_strings_tok``, ``alt_strings_tok``,
			``candidates_tok``, ``candidates`` and ``base_strings``.
		model: object exposing ``get_probabilities_for_examples_multitoken`` and
			``attention_intervention_experiment``.
		effect_types: effect names; each one is handed to
			``model.attention_intervention_experiment``.

	Returns:
		dict holding the base/alternate candidate probabilities and odds, the
		total effect, and for every effect type the keys ``<type>_odds_head``,
		``<type>_effect_head``, ``<type>_effect_layer`` and
		``<type>_effect_model``.
	"""
	base_tokens = intervention.base_strings_tok
	alt_tokens = intervention.alt_strings_tok

	# Only forward passes are needed here, so gradient tracking is disabled.
	with torch.no_grad():
		candidate1_base_prob, candidate2_base_prob = \
			model.get_probabilities_for_examples_multitoken(base_tokens, intervention.candidates_tok)
		candidate1_alt_prob, candidate2_alt_prob = \
			model.get_probabilities_for_examples_multitoken(alt_tokens, intervention.candidates_tok)

	def _readable(tokens):
		# Join the candidate's tokens and strip the 'Ġ' marker characters.
		return ' '.join(tokens).replace('Ġ', '')

	candidate1 = _readable(intervention.candidates[0])
	candidate2 = _readable(intervention.candidates[1])

	odds_base = candidate2_base_prob / candidate1_base_prob
	odds_alt = candidate2_alt_prob / candidate1_alt_prob

	def _relative_change(odds):
		# Change of the given odds relative to the un-intervened base odds.
		return (odds - odds_base) / odds_base

	results = {
		'base_string1': intervention.base_strings[0],
		'base_string2': intervention.base_strings[1],
		'candidate1': candidate1,
		'candidate2': candidate2,
		'candidate1_base_prob': candidate1_base_prob,
		'candidate2_base_prob': candidate2_base_prob,
		'odds_base': odds_base,
		'candidate1_alt_prob': candidate1_alt_prob,
		'candidate2_alt_prob': candidate2_alt_prob,
		'odds_alt': odds_alt,
		'total_effect': _relative_change(odds_alt),
	}

	for effect_type in effect_types:
		(c1_head, c2_head,
		 c1_layer, c2_layer,
		 c1_model, c2_model) = model.attention_intervention_experiment(intervention, effect_type)

		odds_head = c2_head / c1_head
		odds_layer = c2_layer / c1_layer
		odds_model = c2_model / c1_model

		head_effect = _relative_change(odds_head)
		layer_effect = _relative_change(odds_layer)
		model_effect = _relative_change(odds_model)

		# Head- and layer-level values are converted with .tolist(); the
		# model-level effect is stored as returned by the arithmetic above.
		prefix = effect_type + "_"
		results[prefix + "odds_head"] = odds_head.tolist()
		results[prefix + "effect_head"] = head_effect.tolist()
		results[prefix + "effect_layer"] = layer_effect.tolist()
		results[prefix + "effect_model"] = model_effect

	return results


def report_intervention(results, effect_types=('indirect', 'direct'), verbose=False):
	"""Print and plot the outcome of one intervention.

	Args:
		results: result dict with the keys ``base_string1``, ``base_string2``,
			``candidate1``, ``candidate2``, ``odds_base``, ``odds_alt``,
			``total_effect`` and ``<type>_effect_head`` for each effect type.
		effect_types: effect names to draw a per-head heatmap for.
		verbose: when true, also print a description of each effect.
	"""
	labelled_keys = (
		("x ", 'base_string1'),
		("x'", 'base_string2'),
		("c1", 'candidate1'),
		("c2", 'candidate2'),
	)
	for label, key in labelled_keys:
		print(f"{label}: {results[key]}")

	base_odds = results['odds_base']
	print(f"\np(c2|x) / p(c1|x) = {base_odds:.5f}")
	alt_odds = results['odds_alt']
	print(f"p(c2|x') / p(c1|x') = {alt_odds:.5f}")
	total = results['total_effect']
	print(f"\nTOTAL Effect: (p(c2|x') / p(c1|x')) / (p(c2|x) / p(c1|x)) - 1 = {total:.3f}")

	# Explanatory text shown in verbose mode; other effect names get none.
	explanations = {
		'indirect': (
			"   Intervention: replace Attn(x) with Attn(x') in a specific layer/head",
			"   Effect = (p(c2|x, Attn(x')) / p(c1|x, Attn(x')) / (p(c2|x) / p(c1|x)) - 1",
		),
		'direct': (
			"   Intervention: replace x with x' while preserving Attn(x) in a specific layer/head",
			"   Effect = (p(c2|x', Attn(x)) / p(c1|x', Attn(x)) / (p(c2|x) / p(c1|x)) - 1",
		),
	}

	for effect_type in effect_types:
		if verbose:
			print(f'\n{effect_type.upper()} Effect')
			for line in explanations.get(effect_type, ()):
				print(line)

		# One annotated heatmap per effect type.
		plt.figure(figsize=(9, 7))
		head_effects = results[effect_type + '_effect_head']
		axes = sns.heatmap(head_effects, annot=True, annot_kws={"size": 12}, fmt=".2f")
		axes.set(xlabel='Head', ylabel='Layer', title=f'{effect_type.capitalize()} Effect')


def perform_interventions(interventions, model, effect_types=('indirect', 'direct')):
	"""Run ``perform_intervention`` on every intervention, showing a progress bar.

	Returns the list of result dicts in the same order as ``interventions``.
	"""
	return [
		perform_intervention(item, model, effect_types)
		for item in tqdm(interventions)
	]


def report_interventions_summary_by_head(results, effect_types=('indirect', 'direct'), verbose=False, k=10,
		show_head_examples=False):
	"""Report summary results for multiple interventions by head.

	For every effect type this prints the ``k`` top-ranked heads (largest mean
	effect for 'indirect', smallest mean effect for any other type), each with
	a one-tailed p-value from a t-test of that head's values against the values
	of all heads, and then draws a heatmap of the mean effect per head.

	Args:
		results: list of result dicts; the keys read are ``total_effect`` and
			``<type>_effect_head`` and, when ``show_head_examples`` is set,
			``indirect_effect_head``, ``base_string1``, ``candidate1`` and
			``candidate2``.
		effect_types: effect names to summarise.
		verbose: when true, also print a description of each effect.
		k: number of heads to list per effect type.
		show_head_examples: when true, print the three examples with the
			largest indirect effect for each listed head ('indirect' only).
	"""

	df = pd.DataFrame(results)

	print('*** SUMMARY BY HEAD ***')
	print(f"Num interventions: {len(df)}")
	print(f"Mean total effect: {df.total_effect.mean():.3f}")

	for effect_type in effect_types:
		# Convert column to 3d ndarray (num_examples x num_layers x num_heads)
		effect = np.stack(df[effect_type + '_effect_head'].to_numpy())
		mean_effect = effect.mean(axis=0)
		# 'indirect' ranks the largest means first; every other type ranks the
		# smallest means first.
		expect_positive = effect_type == 'indirect'
		ranking_metric = mean_effect if expect_positive else -mean_effect
		# The result must not be bound to the name ``topk_indices``: assigning
		# to that name inside this function makes it local and the call to the
		# module-level helper fails with UnboundLocalError.
		top_indices = topk_indices(ranking_metric, k)

		# Compute significance levels
		all_values = effect.flatten()
		print(f'\n{effect_type.upper()} Effect (mean = {all_values.mean()})')
		print(f"Top {k} heads:")
		for ind in top_indices:
			layer, head = np.unravel_index(ind, mean_effect.shape)
			head_values = effect[:, layer, head].flatten()
			tstatistic, pvalue = ttest_ind(head_values, all_values)
			# Halving the two-tailed p-value is only valid when the statistic
			# lies on the side being tested; otherwise the one-tailed value is
			# the complement. (Previously an assert aborted the report here,
			# e.g. when k reached past the heads on the expected side.)
			if expect_positive:
				in_expected_direction = tstatistic > 0
			else:
				in_expected_direction = tstatistic < 0
			one_tailed_pvalue = pvalue / 2 if in_expected_direction else 1 - pvalue / 2
			print(f'   {layer} {head}: {mean_effect[layer, head]:.3f} (p={one_tailed_pvalue:.4f})')
			if effect_type == 'indirect' and show_head_examples:
				# The key function is consumed by sorted() right away, so it
				# sees the current layer/head of this iteration.
				top_results_for_head = sorted(results,
					key=lambda result: result['indirect_effect_head'][layer][head],
					reverse=True)
				for result in top_results_for_head[:3]:
					print(f'      {result["indirect_effect_head"][layer][head]:.3f} '
						f'{result["base_string1"]} | {result["candidate1"]} | {result["candidate2"]}')
		if verbose:
			if effect_type == 'indirect':
				print("   Intervention: replace Attn(x) with Attn(x') in a specific layer/head")
				print("   Effect = (p(c2|x, Attn(x')) / p(c1|x, Attn(x')) / (p(c2|x) / p(c1|x)) - 1")
			elif effect_type == 'direct':
				print("   Intervention: replace x with x' while preserving Attn(x) in a specific layer/head")
				print("   Effect = (p(c2|x', Attn(x)) / p(c1|x', Attn(x)) / (p(c2|x) / p(c1|x)) - 1")
		plt.figure(figsize=(14, 10))
		ax = sns.heatmap(mean_effect, annot=True, annot_kws={"size": 12}, fmt=".2f")
		ax.set(xlabel='Head', ylabel='Layer', title=f'Mean {effect_type.capitalize()} Effect')

def report_interventions_summary_by_layer(results, effect_types=('indirect', 'direct')):
	"""Print the intervention count and mean total effect, then plot per-layer means.

	Args:
		results: list of result dicts; the keys read are ``total_effect`` and
			``<type>_effect_layer`` for each effect type.
		effect_types: effect names to draw a horizontal bar chart for.
	"""
	frame = pd.DataFrame(results)

	print('*** SUMMARY BY LAYER ***')
	print(f"Num interventions: {len(frame)}")
	print(f"Mean total effect: {frame.total_effect.mean():.3f}")

	for effect_type in effect_types:
		# Stack the per-example lists into a 2d ndarray (num_examples x num_layers)
		column = effect_type + '_effect_layer'
		per_example = np.stack(frame[column].to_numpy())
		layer_means = per_example.mean(axis=0)
		layer_ids = list(range(layer_means.shape[0]))

		# One horizontal bar per layer.
		plt.figure(figsize=(9, 7))
		axes = sns.barplot(x=layer_means, y=layer_ids, color="blue", saturation=.3, orient="h")
		axes.set(ylabel='Layer', title=f'Mean {effect_type.capitalize()} Effect')


def get_odds_ratio(intervention, model):
	"""Return the candidate odds for the second input divided by those for the first.

	The odds are p(candidate 2) / p(candidate 1) as reported by
	``model.get_probabilities_for_examples_multitoken``.

	NOTE(review): the two inputs are taken from ``base_strings_tok[0]`` and
	``base_strings_tok[1]``, whereas ``perform_intervention`` uses
	``base_strings_tok`` and ``alt_strings_tok`` unindexed — confirm which
	layout the intervention object actually has.
	"""
	first_tokens = intervention.base_strings_tok[0]
	second_tokens = intervention.base_strings_tok[1]
	query = model.get_probabilities_for_examples_multitoken

	# Only forward passes are needed here, so gradient tracking is disabled.
	with torch.no_grad():
		c1_first, c2_first = query(first_tokens, intervention.candidates_tok)
		c1_second, c2_second = query(second_tokens, intervention.candidates_tok)

	first_odds = c2_first / c1_first
	second_odds = c2_second / c1_second
	ratio = second_odds / first_odds
	return ratio

def topk_indices(arr, k):
	"""Return the flat indices of the ``k`` largest values of ``arr``, largest first."""
	# Sorting the negated values in ascending order over the flattened array
	# puts the largest original values first.
	negated = -arr
	descending_order = negated.argsort(axis=None)
	return descending_order[:k]


if __name__ == "__main__":
	# Script entry point: run every intervention through the model, then print
	# and plot the per-layer summary.
	# NOTE(review): neither `Model` nor `interventions` is defined or imported
	# in the visible part of this file — confirm they are provided before
	# running this as a script, otherwise this raises NameError.
	model = Model(output_attentions=True)
	results = perform_interventions(interventions, model)
	report_interventions_summary_by_layer(results)