Unsupervised learning

M. Benesty

2017-12-10

 library(fastrtext)

    # Example corpora shipped with the package
    data("train_sentences")
    data("test_sentences")

    # Lower-case the training text; each element becomes one line of the corpus
    texts <- tolower(train_sentences[, "text"])

    # Paths for the training corpus and for the model output
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()
    writeLines(text = texts, con = tmp_file_txt)

    # Train an unsupervised skip-gram model on the corpus file
    skipgram_args <- c(
      "skipgram",
      "-input", tmp_file_txt,
      "-output", tmp_file_model,
      "-verbose", 1
    )
    execute(commands = skipgram_args)
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0%  words/sec/thread: 17873  lr: 0.000000  loss: 2.684977  eta: 0h0m
    # Load the model from the output path given at training time
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # Look at the first few entries of the model's dictionary
    dict <- get_dictionary(model)
    print(head(dict, n = 5))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Show the vectors the model holds for two related words
  word_vectors <- get_word_vectors(model, c("time", "timing"))
  print(word_vectors)
##               [,1]        [,2]        [,3]        [,4]      [,5]
## time   0.003701569 -0.02094992 -0.01471546 -0.01789329 0.2266414
## timing 0.013894431 -0.08183648 -0.06753200  0.04775887 0.1996867
##                [,6]       [,7]        [,8]       [,9]      [,10]
## time    0.002168365 0.06676301 -0.12448528 0.03677384 -0.3228754
## timing -0.007361818 0.08205586 -0.09709805 0.05838207 -0.3673089
##             [,11]      [,12]      [,13]      [,14]     [,15]     [,16]
## time   -0.1560771 -0.1089045 -0.1665083 -0.3034483 0.1834774 0.3380671
## timing -0.1767532 -0.1145322 -0.1251771 -0.2920484 0.2287753 0.3240635
##               [,17]      [,18]      [,19]     [,20]      [,21]       [,22]
## time   -0.008159334 -0.2633924 -0.1394292 0.2078210 -0.1673992 -0.06434517
## timing  0.033217650 -0.2177957 -0.1610327 0.2199318 -0.2127582 -0.06077509
##             [,23]      [,24]       [,25]       [,26]     [,27]      [,28]
## time   0.03844096 0.07560235 -0.06963576 -0.06772124 0.3206027 -0.1500487
## timing 0.04070080 0.10669171 -0.05562536 -0.11645079 0.2792127 -0.1860814
##            [,29]     [,30]      [,31]       [,32]      [,33]     [,34]
## time   0.1248527 0.1448268 -0.1828063 -0.03314526 -0.4563349 0.1622250
## timing 0.1560311 0.1518059 -0.1956239 -0.07268007 -0.5754561 0.2329637
##             [,35]       [,36]      [,37]      [,38]      [,39]      [,40]
## time   0.18239672 -0.07182057 -0.2125761 -0.1732862 -0.1412876 -0.2736001
## timing 0.09566456 -0.02583711 -0.2234931 -0.1586431 -0.1038917 -0.3147692
##             [,41]     [,42]     [,43]      [,44]      [,45]      [,46]
## time   -0.1657367 0.1987531 0.2765933 0.07641257 -0.2995409 -0.2836653
## timing -0.1617544 0.3175420 0.2641679 0.09102539 -0.3341857 -0.2722032
##             [,47]     [,48]        [,49]     [,50]       [,51]       [,52]
## time   0.08336268 0.1564988  0.017310962 0.2266703 -0.01006304 -0.08686765
## timing 0.07831305 0.0824019 -0.007391307 0.2651779 -0.01950596 -0.06850933
##            [,53]      [,54]      [,55]      [,56]      [,57]      [,58]
## time   0.2366780 0.08278372 -0.0991618 -0.2184374 -0.1216459 -0.2547537
## timing 0.3462267 0.04171500 -0.1381191 -0.2058623 -0.1362129 -0.2276540
##              [,59]    [,60]      [,61]       [,62]      [,63]     [,64]
## time   -0.06431224 0.101693 -0.1573385 -0.04101235 -0.1416416 0.2821039
## timing -0.10218949 0.129319 -0.1646961 -0.05727104 -0.1458822 0.2469770
##              [,65]       [,66]      [,67]       [,68]       [,69]
## time    0.04452583 -0.03029095 0.03789760 -0.09694828 -0.07512284
## timing -0.02036526 -0.04895821 0.00385429 -0.08576424 -0.12879328
##              [,70]      [,71]        [,72]       [,73]       [,74]
## time   -0.02975520 0.02787634 -0.002216312 -0.05120584 -0.15087977
## timing -0.06647979 0.05113344 -0.009569229 -0.04806627 -0.09930108
##             [,75]      [,76]       [,77]     [,78]        [,79]     [,80]
## time   0.04904853 0.07739296 -0.02614848 0.1387951  0.002968461 0.2118717
## timing 0.02961397 0.12361205  0.03486326 0.1127628 -0.070090353 0.1957414
##              [,81]     [,82]       [,83]     [,84]        [,85]
## time   -0.03165758 0.1319712 -0.05060453 0.1214691 -0.011487569
## timing  0.02906851 0.1278033 -0.07433826 0.1201687 -0.007929295
##              [,86]      [,87]     [,88]      [,89]       [,90]     [,91]
## time   -0.06246862 0.01292722 0.1442447 -0.1338669 0.005476377 0.3219993
## timing -0.07597567 0.02941429 0.1507549 -0.1205056 0.007609359 0.3645059
##            [,92]     [,93]       [,94]      [,95]      [,96]      [,97]
## time   0.1025874 0.1906714 -0.06372983 -0.1888554 0.03824912 -0.2237726
## timing 0.1045415 0.1501275 -0.01279421 -0.1794955 0.08756206 -0.2023151
##             [,98]      [,99]      [,100]
## time   -0.2652794 -0.1595625 -0.11587554
## timing -0.2796857 -0.1236199 -0.07190448
  # Distance between the vectors of the two words
  # (printed explicitly, like the other results in this document)
  print(get_word_distance(model, "time", "timing"))
##            [,1]
## [1,] 0.02651878
  # Remove temporary files and free memory.
  # tempfile() only generates a path, and the "-output" value is used by
  # fastText as a prefix: the files actually written are <prefix>.bin and
  # <prefix>.vec (load_model reports adding ".bin" above). Unlinking the bare
  # prefix alone would leave those files behind, so remove them explicitly.
  unlink(tmp_file_txt)
  unlink(c(tmp_file_model, paste0(tmp_file_model, c(".bin", ".vec"))))
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  557412 29.8     940480 50.3   940480 50.3
## Vcells 1157783  8.9    1943014 14.9  1535844 11.8