Unsupervised learning

M. Benesty

2019-02-03

 library(fastrtext)
    
    # Load the example sentence datasets shipped with the package.
    data("train_sentences", "test_sentences")

    # Temporary paths: one for the training text, one used as the model
    # output prefix handed to fastText.
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()

    # Lower-case the training sentences and write them to the training file.
    texts <- tolower(train_sentences[, "text"])
    writeLines(texts, tmp_file_txt)

    # Run the fastText "skipgram" command on the text file.
    skipgram_args <- c(
      "skipgram",
      "-input", tmp_file_txt,
      "-output", tmp_file_model,
      "-verbose", 1
    )
    execute(commands = skipgram_args)
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   17252 lr:  0.000000 loss:  2.780208 ETA:   0h 0m
    # Load the trained model from the output prefix; load_model() reports
    # that it appends the ".bin" extension when the path lacks one.
    model <- load_model(path = tmp_file_model)
## add .bin extension to the path
    # Retrieve the model's dictionary and show its first five entries.
    dict <- get_dictionary(model)
    first_entries <- head(dict, n = 5)
    print(first_entries)
## [1] "the"  "</s>" "of"   "to"   "and"
  # Look up and display the vectors of two words.
  query_words <- c("time", "timing")
  word_vectors <- get_word_vectors(model, query_words)
  print(word_vectors)
##               [,1]       [,2]        [,3]       [,4]      [,5]       [,6]
## time   -0.06881645 0.13881408 -0.07818315 0.01426182 0.2177156 0.08689585
## timing -0.07808673 0.07722373 -0.14528590 0.08732751 0.1731965 0.09384152
##               [,7]        [,8]          [,9]     [,10]      [,11]
## time   -0.04004750 0.001533373 -0.0007253154 0.1399308 0.09852907
## timing -0.01827694 0.034195002  0.0421412103 0.1247030 0.08031375
##             [,12]       [,13]      [,14]        [,15]      [,16]
## time   -0.1191767 -0.10353918 0.02764916 -0.033247512 0.14442311
## timing -0.1042185 -0.02960716 0.04948580  0.009944195 0.09607919
##              [,17]       [,18]       [,19]      [,20]      [,21]
## time   -0.09005778 -0.02592032 -0.08515928 0.02139921 -0.3099537
## timing -0.05461452  0.01770881 -0.09622553 0.01220419 -0.3380381
##              [,22]      [,23]       [,24]      [,25]      [,26]     [,27]
## time   -0.09953389 0.06612019 -0.08599593 -0.2367783 -0.0942620 0.3037152
## timing -0.12494829 0.05502555 -0.07126109 -0.2008297 -0.1557036 0.2433577
##             [,28]     [,29]      [,30]       [,31]      [,32]      [,33]
## time   -0.4025318 0.2154185 0.04211997 -0.08983453 0.10557507 -0.1719915
## timing -0.4536428 0.2121897 0.05420741 -0.10548190 0.07882373 -0.2462448
##            [,34]       [,35]      [,36]      [,37]        [,38]
## time   0.2924196  0.05263206 0.01956245 -0.4029368 -0.066329472
## timing 0.3792267 -0.04600971 0.08197001 -0.3809043 -0.004167411
##               [,39]      [,40]      [,41]    [,42]     [,43]     [,44]
## time   -0.052381329 -0.2036445 -0.2397877 0.139930 0.3381488 0.1291182
## timing -0.007633572 -0.2187377 -0.2367515 0.206506 0.2946863 0.1510328
##              [,45]      [,46]      [,47]     [,48]      [,49]      [,50]
## time   -0.03193269 -0.2378970 0.06859719 0.1825551 -0.2686876 0.08651234
## timing -0.08173822 -0.2124958 0.07006828 0.1119169 -0.3248180 0.12154076
##             [,51]      [,52]     [,53]        [,54]      [,55]      [,56]
## time   0.10821523 -0.1680679 0.1546575  0.003572652 -0.3960227 -0.1705540
## timing 0.06919222 -0.1597644 0.2407349 -0.060087670 -0.4159634 -0.1516206
##               [,57]      [,58]     [,59]      [,60]      [,61]      [,62]
## time   -0.024260765 -0.1659616 0.1821883 0.03706855 -0.2763467 0.02407292
## timing -0.003398884 -0.1218584 0.1684446 0.02031613 -0.2956797 0.01568252
##            [,63]       [,64]       [,65]      [,66]       [,67]      [,68]
## time   -0.128762 0.070527546  0.02317665 -0.1285444 -0.05639143 -0.3069461
## timing -0.111182 0.009115928 -0.05184232 -0.1561003 -0.12208246 -0.2982798
##             [,69]     [,70]      [,71]        [,72]      [,73]       [,74]
## time   0.10994899 0.2068656 0.05114485 -0.001922393 -0.1825254 -0.04509224
## timing 0.06163164 0.1852793 0.06777245 -0.011101018 -0.1479235  0.01004949
##              [,75]      [,76]      [,77]     [,78]        [,79]      [,80]
## time   -0.02210893 -0.2974024 0.01545263 0.2451846 0.0704050213 0.02826298
## timing -0.03507486 -0.2909532 0.04262315 0.2054351 0.0006871345 0.01791780
##             [,81]      [,82]      [,83]      [,84]       [,85]      [,86]
## time   -0.2427916 0.12106141 -0.1213794 0.06143517 -0.02094532 -0.1544727
## timing -0.1801297 0.08734585 -0.1432493 0.07163851 -0.03429535 -0.2089804
##            [,87]      [,88]       [,89]      [,90]     [,91]       [,92]
## time   0.2886949 0.02271891 -0.07134578 0.03566511 0.1773788 -0.01886125
## timing 0.2790698 0.02051279 -0.08172692 0.06682864 0.2322661 -0.01755186
##             [,93]       [,94]      [,95]      [,96]      [,97]       [,98]
## time   0.11287729 -0.05840375 -0.3206510 0.07719425 -0.1677442 -0.08019492
## timing 0.04483107 -0.02629983 -0.3250969 0.16781205 -0.1345256 -0.05601808
##             [,99]      [,100]
## time   0.02656443 -0.13345182
## timing 0.06882897 -0.09492695
  # Compute the distance between the two words and display it.
  time_timing_distance <- get_word_distance(model, "time", "timing")
  print(time_timing_distance)
##            [,1]
## [1,] 0.03398623
  # Remove the temporary files and free memory.
  unlink(tmp_file_txt)
  # fastText treats "-output" as a prefix and writes <prefix>.bin and
  # <prefix>.vec, so no file exists at the bare prefix path; delete the
  # files that were actually created.
  unlink(paste0(tmp_file_model, c(".bin", ".vec")))
  # Drop the model object, then run the garbage collector (its memory usage
  # summary is printed).
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  579243 31.0    1172200 62.7  1172200 62.7
## Vcells 1249953  9.6    8388608 64.0  1758476 13.5