class EfficientAttention {
  constructor(d_model, num_heads, attention_type = 'sparse') {
    this.d_model = d_model;
    this.num_heads = num_heads;
    this.attention_type = attention_type;
    // Per-head dimension; d_model is assumed to be divisible by num_heads.
    this.head_dim = d_model / num_heads;
    this.attention_impl = this.selectAttentionImplementation();
  }

  // Pick a concrete attention variant; falls back to standard softmax attention.
  selectAttentionImplementation() {
    switch (this.attention_type) {
      case 'sparse':
        return new SparseAttention(this.d_model, this.num_heads);
      case 'linear':
        return new LinearAttention(this.d_model, this.num_heads);
      case 'flash':
        return new FlashAttention(this.d_model, this.num_heads);
      default:
        return new StandardAttention(this.d_model, this.num_heads);
    }
  }

  // Delegate the forward pass to the selected implementation.
  forward(query, key, value, mask = null) {
    return this.attention_impl.compute(query, key, value, mask);
  }
}
class SparseAttention {
  constructor(d_model, num_heads) {
    this.d_model = d_model;
    this.num_heads = num_heads;
    this.head_dim = d_model / num_heads;
    this.top_k = 64; // number of key positions each query is allowed to attend to
  }

  compute(query, key, value, mask = null) {
    // Scaled dot-product scores: [..., seq_len_q, seq_len_k]
    const scores = this.matmul(query, key.transpose(-2, -1))
      .div(Math.sqrt(this.head_dim));
    // Keep only the top-k scores per query before the softmax.
    const sparse_scores = this.applySparsity(scores, this.top_k);
    if (mask) {
      sparse_scores.maskedFill(mask, -Infinity);
    }
    const weights = this.softmax(sparse_scores);
    return this.matmul(weights, value);
  }

  applySparsity(scores, top_k) {
    // Select the top-k scores along the key dimension for each query and
    // push everything else to -Infinity so it receives zero attention weight
    // after the softmax (multiplying by a 0/1 mask would leave scores of 0,
    // which would still get non-zero weight).
    const topk = this.topK(scores, top_k, -1);
    const keep = this.zerosLike(scores);
    keep.scatter_(-1, topk.indices, 1);
    return scores.maskedFill(keep.eq(0), -Infinity);
  }
}
class LinearAttention {
  constructor(d_model, num_heads) {
    this.d_model = d_model;
    this.num_heads = num_heads;
  }

  // Non-causal (bidirectional) linear attention: apply a feature map to Q and K,
  // then use associativity to aggregate key-value products once, giving O(n)
  // cost in the sequence length instead of O(n^2).
  compute(query, key, value, mask = null) {
    // Apply the feature map once and reuse the results.
    const q_prime = this.featureMap(query); // [..., n, d]
    const k_prime = this.featureMap(key);   // [..., n, d]
    // Aggregate key-value products: [..., d, d_v]
    const kv = this.matmul(k_prime.transpose(-2, -1), value);
    const output = this.matmul(q_prime, kv); // [..., n, d_v]
    // Normalize each query by phi(q) . sum_j phi(k_j) so the implicit attention
    // weights sum to one; the epsilon guards against division by zero.
    // Note: a padding mask would need to be applied to k_prime before aggregation.
    const normalizer = this.sum(k_prime, -2);                               // [..., d]
    const q_normalizer = this.matmul(q_prime, normalizer.unsqueeze(-1));    // [..., n, 1]
    return output.div(q_normalizer.add(1e-6));
  }

  // ReLU keeps the features non-negative; the epsilon above covers the case
  // where an entire feature row is zeroed out.
  featureMap(x) {
    return this.relu(x);
  }
}
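// A minimal usage sketch (not part of the original listing): it assumes the
// tensor methods used above (matmul, softmax, transpose, scatter_, ...) are
// provided by whatever tensor library backs these classes, and that
// FlashAttention and StandardAttention are defined elsewhere. Shapes are
// illustrative only.
//
//   const attention = new EfficientAttention(512, 8, 'sparse');
//   // query, key, value: tensors of shape [batch, seq_len, d_model]
//   const output = attention.forward(query, key, value);
//
//   // Switching strategies only changes the constructor argument:
//   const linear = new EfficientAttention(512, 8, 'linear');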