# Word2Vec from scratch

This [self-contained implementation](https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy) is instructive, and you should [go through it](https://github.com/nathanrooy/word2vec-from-scratch-with-python/blob/master/word2vec.py) to understand how word2vec embeddings are learned.

In [7]:
import numpy as np
import re
from collections import defaultdict



class word2vec():
    """Minimal skip-gram word2vec model trained with a full softmax.

    Typical use:
        1. construct the model (hyper-parameters come from a settings mapping),
        2. call ``generate_training_data`` to build the vocabulary and the
           (target, context) samples,
        3. call ``train`` to fit ``w1`` (embedding matrix) and ``w2``
           (context matrix),
        4. query with ``word_vec``, ``vec_sim`` or ``word_sim``.
    """

    def __init__(self, settings=None):
        """Store the hyper-parameters.

        Args:
            settings: mapping with the keys ``'n'`` (embedding dimension),
                ``'learning_rate'``, ``'epochs'`` and ``'window_size'``.
                When omitted, the module-level ``settings`` variable is used,
                which keeps the historical ``word2vec()`` call working.

        Raises:
            NameError: if no mapping is passed and no module-level
                ``settings`` exists.
            KeyError: if a required key is missing from the mapping.
        """
        if settings is None:
            # Backward compatibility: earlier versions read a global directly.
            try:
                settings = globals()['settings']
            except KeyError:
                raise NameError("name 'settings' is not defined") from None

        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    # GENERATE TRAINING DATA
    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the one-hot training samples.

        Sets ``self.v_count``, ``self.words_list``, ``self.word_index`` and
        ``self.index_word`` as a side effect.

        Args:
            settings: accepted for interface compatibility; not read here.
            corpus: iterable of sentences, each a sequence of word tokens.

        Returns:
            An object array of shape ``(num_samples, 2)``. Column 0 holds the
            one-hot list of the target word, column 1 the list of one-hot
            lists of the words inside the context window.
        """
        # GENERATE LOOKUP DICTIONARIES
        vocabulary = {word for row in corpus for word in row}
        self.v_count = len(vocabulary)
        self.words_list = sorted(vocabulary)
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = dict(enumerate(self.words_list))

        samples = []
        # CYCLE THROUGH EACH SENTENCE IN CORPUS
        for sentence in corpus:
            sent_len = len(sentence)

            # CYCLE THROUGH EACH WORD IN SENTENCE
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)

                # Clip the context window to the sentence boundaries.
                first = max(0, i - self.window)
                last = min(sent_len, i + self.window + 1)
                w_context = [self.word2onehot(sentence[j])
                             for j in range(first, last) if j != i]
                samples.append((w_target, w_context))

        # Context lists differ in length, so the samples are ragged.
        # np.array() on ragged input warns on older NumPy and raises on
        # newer releases; fill an explicit object array instead.
        training_data = np.empty((len(samples), 2), dtype=object)
        for k, (w_target, w_context) in enumerate(samples):
            training_data[k, 0] = w_target
            training_data[k, 1] = w_context
        return training_data

    # SOFTMAX ACTIVATION FUNCTION
    def softmax(self, x):
        """Return the softmax of ``x`` along axis 0.

        The maximum is subtracted first so ``exp`` cannot overflow.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    # CONVERT WORD TO ONE HOT ENCODING
    def word2onehot(self, word):
        """Return the one-hot encoding of ``word`` as a list of ints.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        word_vec = [0] * self.v_count
        word_vec[self.word_index[word]] = 1
        return word_vec

    # FORWARD PASS
    def forward_pass(self, x):
        """Run one forward pass for the one-hot input ``x``.

        Returns:
            ``(y_c, h, u)``: softmax output, hidden layer and raw scores.
        """
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u

    # BACKPROPAGATION
    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: prediction error summed over the context words.
            h: hidden layer returned by ``forward_pass``.
            x: one-hot input that produced ``h``.
        """
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)

    # TRAIN W2V model
    def train(self, training_data):
        """Fit ``w1`` and ``w2`` on the given samples.

        Prints the summed loss after every epoch and leaves the last value
        in ``self.loss``.

        Args:
            training_data: iterable of ``(target_one_hot, context_one_hots)``
                pairs as returned by ``generate_training_data``.
        """
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix

        # CYCLE THROUGH EACH EPOCH
        for i in range(self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:

                # A word without neighbours (one-word sentence) carries no
                # training signal and would give a scalar error that breaks
                # the gradient shapes in backprop.
                if len(w_c) == 0:
                    continue

                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)

                # CALCULATE ERROR
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS
                # log(sum(exp(u))) computed with the max factored out so
                # large scores cannot overflow.
                u_max = np.max(u)
                log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
                context_scores = np.sum([u[np.argmax(word)] for word in w_c])
                self.loss += -context_scores + len(w_c) * log_norm

            print('EPOCH:', i, 'LOSS:', self.loss)

    # input a word, returns a vector (if available)
    def word_vec(self, word):
        """Return the embedding (row of ``w1``) for ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        return self.w1[self.word_index[word]]

    def _print_most_similar(self, vec, top_n):
        """Print the ``top_n`` vocabulary words by cosine similarity to ``vec``."""
        vec_norm = np.linalg.norm(vec)

        # CYCLE THROUGH VOCAB
        similarities = {}
        for i in range(self.v_count):
            candidate = self.w1[i]
            theta_num = np.dot(vec, candidate)
            theta_den = vec_norm * np.linalg.norm(candidate)
            similarities[self.index_word[i]] = theta_num / theta_den

        ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)

        for word, sim in ranked[:top_n]:
            print(word, sim)

    # input a vector, returns nearest word(s)
    def vec_sim(self, vec, top_n):
        """Print the ``top_n`` words whose embeddings are closest to ``vec``."""
        self._print_most_similar(vec, top_n)

    # input word, returns top [n] most similar words
    def word_sim(self, word, top_n):
        """Print the ``top_n`` words most similar to ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        self._print_most_similar(self.w1[self.word_index[word]], top_n)



In [8]:

# Hyper-parameters for the run. The word2vec class in this file reads only
# 'n', 'learning_rate', 'epochs' and 'window_size'.
settings = {
    'n': 5,                   # dimension of word embeddings
    'window_size': 2,         # context window +/- center word
    'min_count': 0,           # minimum word count
    'epochs': 30000,          # number of training epochs
    'neg_samp': 10,           # number of negative words to use during training
    'learning_rate': 0.01,    # learning rate
}

# Fix the seed so the random weight initialisation is reproducible.
np.random.seed(0)

# Toy corpus: a single tokenised sentence.
corpus = [
    ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog'],
]

# Build the model, derive the (target, context) samples, then fit the weights.
w2v = word2vec()
training_data = w2v.generate_training_data(settings, corpus)
w2v.train(training_data)


  return np.array(training_data)


EPOCH: 0 LOSS: 68.37096376709992
EPOCH: 1 LOSS: 67.88313773129582
EPOCH: 2 LOSS: 67.43127505997181
EPOCH: 3 LOSS: 67.0112049943476
EPOCH: 4 LOSS: 66.61935410862232
EPOCH: 5 LOSS: 66.25264389886307
EPOCH: 6 LOSS: 65.9084083928434
EPOCH: 7 LOSS: 65.5843274465069
EPOCH: 8 LOSS: 65.2783724111086
EPOCH: 9 LOSS: 64.98876161794144
EPOCH: 10 LOSS: 64.71392370243775
EPOCH: 11 LOSS: 64.45246722531292
EPOCH: 12 LOSS: 64.20315538093011
EPOCH: 13 LOSS: 63.96488483826672
EPOCH: 14 LOSS: 63.736667956900455
EPOCH: 15 LOSS: 63.51761777345709
EPOCH: 16 LOSS: 63.30693527348039
EPOCH: 17 LOSS: 63.103898557554714
EPOCH: 18 LOSS: 62.90785358463229
EPOCH: 19 LOSS: 62.71820623435013
EPOCH: 20 LOSS: 62.5344154770562
EPOCH: 21 LOSS: 62.355987477886515
EPOCH: 22 LOSS: 62.18247049153716
EPOCH: 23 LOSS: 62.01345042888875
EPOCH: 24 LOSS: 61.8485469965609
EPOCH: 25 LOSS: 61.687410326726464
EPOCH: 26 LOSS: 61.529718027831066
EPOCH: 27 LOSS: 61.375172597813304
EPOCH: 28 LOSS: 61.22349915045999
EPOCH: 29 LOSS: 61.07444

In [9]:

# Show the trained embedding matrix; row i is the vector for w2v.index_word[i].
print('WORD EMBEDDINGS:', w2v.w1)

WORD EMBEDDINGS: [[-2.23674708  1.03244829  0.82157013 -0.64676677 -2.22089404]
 [ 1.56739884  1.02944415 -0.20785105  3.55926319 -0.97603172]
 [ 0.0606542  -0.40922824  1.26143234 -3.78443936  0.58368752]
 [-0.34960732 -3.00731856  1.11198674  0.42891148  0.69661937]
 [ 0.8663663  -2.10375673 -2.65276693  0.95982568 -1.21829331]
 [ 0.07597795  0.90009406  2.74946442  2.05932677 -1.01954995]
 [-2.66610952 -1.54780811  0.29743046  0.43074061  2.38359804]
 [ 1.0461709   0.47387405 -0.1179152  -0.62239887  0.44490241]]
