Model Steering
Model steering manipulates model activations to control outputs without retraining. This includes one-off interventions, persistent edits, and steering vector techniques.
Basic Steering Intervention
Modify activations during a single forward pass:
from nnsight import LanguageModel
import torch
model = LanguageModel("openai-community/gpt2", device_map="auto", dispatch=True)

# Random steering vector, used purely for demonstration.
# - Its length is read from the model config so it always matches the hidden dimension.
# - It is created on the model's device: with device_map="auto" the weights may live
#   on an accelerator, and adding a CPU vector to those activations in place would fail.
steering_vector = torch.randn(model.config.n_embd, device=model.device)

with model.trace("I think the movie was") as tracer:
    # Add the steering vector to the residual stream of layer 10,
    # at the final token position only.
    model.transformer.h[10].output[0][:, -1, :] += steering_vector
    # Save the logits produced by the steered forward pass.
    steered_logits = model.lm_head.output.save()
Computing Steering Vectors
Contrastive Activation Difference
# Two small contrastive prompt sets: same topic, opposite sentiment.
positive_prompts = [
    "I love this! It's fantastic",
    "This is wonderful and amazing",
    "I'm so happy about this",
]
negative_prompts = [
    "I hate this! It's terrible",
    "This is awful and horrible",
    "I'm so sad about this",
]

layer_idx = 10
positive_acts = []
negative_acts = []

# A single trace with one invoke per prompt; each invoke saves the
# final-token activation of the chosen layer for its own prompt.
with model.trace() as tracer:
    for prompt in positive_prompts:
        with tracer.invoke(prompt):
            act = model.transformer.h[layer_idx].output[0][:, -1, :].save()
            positive_acts.append(act)
    for prompt in negative_prompts:
        with tracer.invoke(prompt):
            act = model.transformer.h[layer_idx].output[0][:, -1, :].save()
            negative_acts.append(act)

# Average difference = steering vector.
# Saved activations are used directly once the trace has exited, the same way the
# other snippets on this page use their saved results (e.g. `generated[0]`).
# NOTE(review): on older nnsight releases the saved objects may still be proxies
# that need `.value` -- confirm against the installed version.
pos_mean = torch.stack(positive_acts).mean(dim=0)
neg_mean = torch.stack(negative_acts).mean(dim=0)
steering_vector = pos_mean - neg_mean
Applying the Steering Vector
steering_strength = 1.5

# --- Single forward pass -------------------------------------------------
with model.trace("The weather today is"):
    # Steer toward positive sentiment: shift every position of the layer output.
    layer_out = model.transformer.h[layer_idx].output[0]
    layer_out[:, :, :] += steering_strength * steering_vector
    output = model.lm_head.output.save()

# --- Generation ----------------------------------------------------------
# The intervention is re-applied on every generation iteration via tracer.all();
# only the final token position is shifted at each step.
with model.generate("The weather today is", max_new_tokens=20) as tracer:
    with tracer.all():
        layer_out = model.transformer.h[layer_idx].output[0]
        layer_out[:, -1, :] += steering_strength * steering_vector
    generated = model.generator.output.save()

first_sequence = generated[0]
print(model.tokenizer.decode(first_sequence))
Persistent Model Editing
Create a modified model version that persists across traces:
# Extract the final-token hidden state of the last block from a "source" prompt.
# Only the last position is kept: the source prompt and the prompts the edit is
# later applied to have different token counts, so a full-sequence tensor could
# not be assigned into them.
with model.trace("The capital of France is Paris") as tracer:
    paris_hidden = model.transformer.h[-1].output[0][:, -1, :].save()

# Create a persistently edited model: every trace run through `edited_model`
# overwrites the final-token hidden state of the last block with the saved one.
with model.edit() as edited_model:
    model.transformer.h[-1].output[0][:, -1, :] = paris_hidden

# Use the edited model. Outputs are read through `edited_model` so they come
# from the edited forward pass.
with edited_model.trace("The capital of Germany is"):
    logits = edited_model.lm_head.output.save()

# The edit persists, so the edited model can be reused for further traces.
with edited_model.trace("The capital of Japan is"):
    logits2 = edited_model.lm_head.output.save()
Ablation Studies
Zero out components to test necessity:
# Zero ablation of a single attention head.
head_idx = 5
head_dim = model.config.n_embd // model.config.n_head
# Column range that belongs to this head in the concatenated head outputs.
start = head_idx * head_dim
end = (head_idx + 1) * head_dim

# NOTE: `prompt` is not defined in this snippet -- supply your own input string.
with model.trace(prompt):
    # `.input` is the first positional argument of c_proj, i.e. the concatenated
    # per-head attention outputs; zeroing this head's columns removes its
    # contribution before the output projection.
    # NOTE(review): very old nnsight releases exposed `.input` as (args, kwargs)
    # and needed `.input[0][0]` -- confirm against the installed version.
    attn_out = model.transformer.h[10].attn.c_proj.input
    attn_out[:, :, start:end] = 0
    ablated_logits = model.lm_head.output.save()
Mean Ablation
Replace a component's activation with its mean over a set of calibration prompts (less disruptive than zeroing it out):
# First compute the mean activation of layer 10 over many prompts.
# NOTE: `calibration_prompts` and `test_prompt` are not defined in this snippet --
# supply your own list of strings and test string.
mean_acts = []
for prompt in calibration_prompts:
    # One trace per prompt, so prompts of different lengths never have to share
    # a tensor. Averaging over the batch and position axes leaves one
    # hidden-size vector per prompt.
    with model.trace(prompt):
        act = model.transformer.h[10].output[0].mean(dim=(0, 1)).save()
    mean_acts.append(act)

# Average the per-prompt vectors into a single hidden-size vector.
mean_activation = torch.stack(mean_acts).mean(dim=0)

# Apply mean ablation: the vector broadcasts over batch and sequence positions,
# so it fits a test prompt of any length.
with model.trace(test_prompt):
    model.transformer.h[10].output[0][:] = mean_activation
    ablated_logits = model.lm_head.output.save()
Activation Addition (ActAdd)
Add vectors at inference time without modifying weights:
def actadd_generate(model, prompt, steering_vector, layer, strength=1.0, max_tokens=50):
    """Generate text from *prompt* with a steering vector added during decoding.

    On every generation iteration, ``strength * steering_vector`` is added to the
    final-token position of ``model.transformer.h[layer].output[0]``.

    Args:
        model: Model exposing ``generate``, ``transformer.h``, ``generator`` and
            ``tokenizer`` as used below.
        prompt: Input text to continue.
        steering_vector: Vector added to the layer output.
        layer: Index into ``model.transformer.h`` selecting the block to steer.
        strength: Multiplier applied to ``steering_vector``.
        max_tokens: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        The first generated sequence, decoded with ``model.tokenizer``.
    """
    with model.generate(prompt, max_new_tokens=max_tokens) as tracer:
        # tracer.all() applies the body to all generation iterations.
        with tracer.all():
            block_out = model.transformer.h[layer].output[0]
            block_out[:, -1, :] += strength * steering_vector
        token_ids = model.generator.output.save()

    first_sequence = token_ids[0]
    return model.tokenizer.decode(first_sequence)
Linear Probing for Steering
Train a probe to find steering directions:
from sklearn.linear_model import LogisticRegression
# Collect final-token activations together with their labels.
# NOTE: `labeled_data` is not defined in this snippet -- supply an iterable of
# (prompt, label) pairs.
activations = []
labels = []
with model.trace() as tracer:
    for prompt, label in labeled_data:
        with tracer.invoke(prompt):
            act = model.transformer.h[layer_idx].output[0][:, -1, :].save()
            activations.append(act)
        labels.append(label)

# Build the feature matrix for scikit-learn.
# - Saved activations are used directly, as elsewhere on this page.
#   NOTE(review): older nnsight releases may need `.value` here -- confirm.
# - detach() and cpu() are required before numpy(): the activations may be part
#   of an autograd graph and may live on an accelerator.
X = torch.stack([a.squeeze() for a in activations]).detach().cpu().numpy()
y = labels

# Train the probe.
probe = LogisticRegression()
probe.fit(X, y)

# Use the probe coefficients as a steering direction. They are converted to
# float32 explicitly because scikit-learn returns float64 coefficients.
steering_direction = torch.tensor(probe.coef_[0], dtype=torch.float32)
Multi-Layer Steering
Apply steering across multiple layers:
# Layer index -> steering strength. GPT-2 (openai-community/gpt2) has 12 blocks,
# so valid indices are 0-11. Keys are listed in ascending order so the
# interventions are defined in the same order the layers execute.
layer_weights = {6: 0.5, 8: 1.0, 10: 1.5, 11: 1.0}

# NOTE: `prompt` is not defined in this snippet -- supply your own input string.
with model.generate(prompt, max_new_tokens=30) as tracer:
    with tracer.all():  # Apply to all generation iterations
        # A dedicated loop variable is used so the global `layer_idx` set
        # earlier on this page is not overwritten.
        for steer_layer, strength in layer_weights.items():
            model.transformer.h[steer_layer].output[0][:, -1, :] += (
                strength * steering_vector
            )
    generated = model.generator.output.save()

print(model.tokenizer.decode(generated[0]))
Steering Vector Analysis
Interpret what your steering vector represents:
# Project the steering vector through the unembedding matrix.
with torch.no_grad():
    unembed = model.lm_head.weight
    # Flatten first: a vector built from `[:, -1, :]` activations has a leading
    # batch axis of size 1, which cannot be multiplied with the weight matrix
    # directly. Device and dtype are matched to the weights as well.
    direction = steering_vector.reshape(-1).to(
        device=unembed.device, dtype=unembed.dtype
    )
    steering_logits = unembed @ direction

# Tokens whose logits the steering direction raises / lowers the most.
top_k = 20
top_values, top_indices = steering_logits.topk(top_k)
bottom_values, bottom_indices = steering_logits.topk(top_k, largest=False)

print("Tokens boosted by steering:")
for val, idx in zip(top_values, top_indices):
    print(f" {model.tokenizer.decode(idx)}: {val:.3f}")

print("\nTokens suppressed by steering:")
for val, idx in zip(bottom_values, bottom_indices):
    print(f" {model.tokenizer.decode(idx)}: {val:.3f}")
Best Practices
- Layer selection: Middle-to-late layers often work best
- Strength tuning: Start small (0.1-0.5), increase gradually
- Validation: Test on diverse prompts to avoid overfitting
- Orthogonalization: Project out directions associated with other behaviors so the vector steers only the target behavior
- Position: Steering at the final token position is usually most effective
Scan to join WeChat group