# ...in a high-dimensional space, 768 dimensions. Words with related meanings
# often appear close together in the embedding space. The figure plots the
# embeddings of a few semantically related tokens, projected into two
# dimensions using Principal Component Analysis (PCA).

# In [ ]:
# NOTE(review): `tokenizer`, `model`, `np` and `plt` are not defined in this
# cell; they are assumed to come from earlier cells -- confirm.

### candidate words ###
words = [
    "father", "mother", "brother", "sister",  # family
    "man", "woman", "boy", "girl",            # gender
    "dog", "cat", "horse", "cow",             # animals
]
# Alternative word set (swap in for the list above):
# words = ["London", "Paris", "Madrid", "Rome", "Oslo", "Berlin",   # capitals
#          "UK", "France", "Spain", "Italy", "Norway", "Germany"]   # countries

### convert words → tokens ###
# A word may split into several sub-word tokens; every piece becomes its own
# point in the plot. The ids are kept next to the decoded labels so the
# embedding lookup below uses exactly the ids the tokenizer produced.
# Re-encoding the decoded text is not guaranteed to give back the same id.
token_ids = []
tokens = []
for w in words:
    for tid in tokenizer.encode(w, add_special_tokens=False):
        token_ids.append(tid)
        tokens.append(tokenizer.decode([tid]))

### collect embedding vectors ###
# One row of the token-embedding table per token id, moved to NumPy.
embedding_table = model.transformer.wte.weight
vectors = np.stack(
    [embedding_table[tid].detach().cpu().numpy() for tid in token_ids]
)

### PCA projection ###
# Centre the data, then project onto the first two right-singular vectors
# (the top two principal directions of the centred matrix).
X = vectors - vectors.mean(axis=0)
U, S, Vt = np.linalg.svd(X, full_matrices=False)
coords = X @ Vt[:2].T

### Plot ###
plt.figure(figsize=(8, 6))
plt.scatter(coords[:, 0], coords[:, 1])
for (x, y), label in zip(coords, tokens):
    plt.text(x, y, label, fontsize=11)
plt.title("GPT-2 Token Embedding Space")
plt.axis("off")
plt.show()

# 16/3/26, 17:14 llmtalk_slides slides file://wsl.localhost/Ubuntu/home/afp/docs/llmtalk/talkv2/notebooks/llmtalk_slides.slides.html#/ 19/34