Unsupervised learning
M. Benesty
2017-12-10
# Attach fastrtext and load the example sentence datasets.
library(fastrtext)
data("train_sentences")
data("test_sentences")

# Training corpus: the "text" column of the training set, lower-cased.
texts <- tolower(train_sentences[, "text"])

# Temporary paths: one for the corpus file, one used as the model output path.
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()
writeLines(texts, tmp_file_txt)

# Train an unsupervised skipgram model by passing command-line style
# arguments to fastText.
execute(
  commands = c(
    "skipgram",
    "-input", tmp_file_txt,
    "-output", tmp_file_model,
    "-verbose", "1"
  )
)
## Read 0M words
## Number of words: 2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 17873 lr: 0.000000 loss: 2.684977 eta: 0h0m
# Load the trained model from the path that was given to fastText as "-output".
model <- load_model(path = tmp_file_model)
## add .bin extension to the path
# Extract the model's dictionary and show its first five entries.
dict <- get_dictionary(model)
print(head(dict, n = 5))
## [1] "the" "</s>" "of" "to" "and"
# Show the embedding vectors of two words; the result has one row per word.
print(
  get_word_vectors(model, words = c("time", "timing"))
)
## [,1] [,2] [,3] [,4] [,5]
## time 0.003701569 -0.02094992 -0.01471546 -0.01789329 0.2266414
## timing 0.013894431 -0.08183648 -0.06753200 0.04775887 0.1996867
## [,6] [,7] [,8] [,9] [,10]
## time 0.002168365 0.06676301 -0.12448528 0.03677384 -0.3228754
## timing -0.007361818 0.08205586 -0.09709805 0.05838207 -0.3673089
## [,11] [,12] [,13] [,14] [,15] [,16]
## time -0.1560771 -0.1089045 -0.1665083 -0.3034483 0.1834774 0.3380671
## timing -0.1767532 -0.1145322 -0.1251771 -0.2920484 0.2287753 0.3240635
## [,17] [,18] [,19] [,20] [,21] [,22]
## time -0.008159334 -0.2633924 -0.1394292 0.2078210 -0.1673992 -0.06434517
## timing 0.033217650 -0.2177957 -0.1610327 0.2199318 -0.2127582 -0.06077509
## [,23] [,24] [,25] [,26] [,27] [,28]
## time 0.03844096 0.07560235 -0.06963576 -0.06772124 0.3206027 -0.1500487
## timing 0.04070080 0.10669171 -0.05562536 -0.11645079 0.2792127 -0.1860814
## [,29] [,30] [,31] [,32] [,33] [,34]
## time 0.1248527 0.1448268 -0.1828063 -0.03314526 -0.4563349 0.1622250
## timing 0.1560311 0.1518059 -0.1956239 -0.07268007 -0.5754561 0.2329637
## [,35] [,36] [,37] [,38] [,39] [,40]
## time 0.18239672 -0.07182057 -0.2125761 -0.1732862 -0.1412876 -0.2736001
## timing 0.09566456 -0.02583711 -0.2234931 -0.1586431 -0.1038917 -0.3147692
## [,41] [,42] [,43] [,44] [,45] [,46]
## time -0.1657367 0.1987531 0.2765933 0.07641257 -0.2995409 -0.2836653
## timing -0.1617544 0.3175420 0.2641679 0.09102539 -0.3341857 -0.2722032
## [,47] [,48] [,49] [,50] [,51] [,52]
## time 0.08336268 0.1564988 0.017310962 0.2266703 -0.01006304 -0.08686765
## timing 0.07831305 0.0824019 -0.007391307 0.2651779 -0.01950596 -0.06850933
## [,53] [,54] [,55] [,56] [,57] [,58]
## time 0.2366780 0.08278372 -0.0991618 -0.2184374 -0.1216459 -0.2547537
## timing 0.3462267 0.04171500 -0.1381191 -0.2058623 -0.1362129 -0.2276540
## [,59] [,60] [,61] [,62] [,63] [,64]
## time -0.06431224 0.101693 -0.1573385 -0.04101235 -0.1416416 0.2821039
## timing -0.10218949 0.129319 -0.1646961 -0.05727104 -0.1458822 0.2469770
## [,65] [,66] [,67] [,68] [,69]
## time 0.04452583 -0.03029095 0.03789760 -0.09694828 -0.07512284
## timing -0.02036526 -0.04895821 0.00385429 -0.08576424 -0.12879328
## [,70] [,71] [,72] [,73] [,74]
## time -0.02975520 0.02787634 -0.002216312 -0.05120584 -0.15087977
## timing -0.06647979 0.05113344 -0.009569229 -0.04806627 -0.09930108
## [,75] [,76] [,77] [,78] [,79] [,80]
## time 0.04904853 0.07739296 -0.02614848 0.1387951 0.002968461 0.2118717
## timing 0.02961397 0.12361205 0.03486326 0.1127628 -0.070090353 0.1957414
## [,81] [,82] [,83] [,84] [,85]
## time -0.03165758 0.1319712 -0.05060453 0.1214691 -0.011487569
## timing 0.02906851 0.1278033 -0.07433826 0.1201687 -0.007929295
## [,86] [,87] [,88] [,89] [,90] [,91]
## time -0.06246862 0.01292722 0.1442447 -0.1338669 0.005476377 0.3219993
## timing -0.07597567 0.02941429 0.1507549 -0.1205056 0.007609359 0.3645059
## [,92] [,93] [,94] [,95] [,96] [,97]
## time 0.1025874 0.1906714 -0.06372983 -0.1888554 0.03824912 -0.2237726
## timing 0.1045415 0.1501275 -0.01279421 -0.1794955 0.08756206 -0.2023151
## [,98] [,99] [,100]
## time -0.2652794 -0.1595625 -0.11587554
## timing -0.2796857 -0.1236199 -0.07190448
# Compute the distance between the embeddings of the same two words.
# Left as a bare top-level expression so the result is auto-printed.
get_word_distance(model, w1 = "time", w2 = "timing")
## [,1]
## [1,] 0.02651878
# Remove the temporary files and free memory.
# The corpus file is stored at exactly tmp_file_txt.
unlink(tmp_file_txt)
# fastText appends extensions to the "-output" path (load_model reported
# adding ".bin" above), so the model lives at <path>.bin with the text
# vectors at <path>.vec. Unlinking only the bare path would leave both files
# behind; unlink() silently ignores paths that do not exist.
unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))
# Drop the model object, then run the garbage collector.
rm(model)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 557412 29.8 940480 50.3 940480 50.3
## Vcells 1157783 8.9 1943014 14.9 1535844 11.8