Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support go module #9

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ venv
lexvec_v1.0.4*
corpus
lexvec
.idea
output
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# NOTE(review): this listing is a rendered diff ("3 additions & 3 deletions"),
# so each changed line appears as a pair; presumably the first line of a pair
# is the removed version and the second is its replacement — confirm.
CC = clang
CC = gcc
CFLAGS = -Ofast -std=gnu99
OBJ = lexvec
# Static build of the binary named by $(OBJ).
BUILD = go build --ldflags '-extldflags "-static"' -o $(OBJ)

# Build with -march=native added to the cgo C flags.
optimal:
CC="$(CC)" CGO_CFLAGS="$(CFLAGS) -march=native" $(BUILD)
cd cmd && CC="$(CC)" CGO_CFLAGS="$(CFLAGS) -march=native" $(BUILD) && cd -

# Build without -march=native.
cross:
CC="$(CC)" CGO_CFLAGS="$(CFLAGS)" $(BUILD)
cd cmd && CC="$(CC)" CGO_CFLAGS="$(CFLAGS)" $(BUILD) && cd -
2 changes: 1 addition & 1 deletion association.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import "math"

Expand Down
9 changes: 9 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package main

import (
"github.com/hjdo/lexvec"
)

// main is the command-line entry point; it delegates all work to lexvec.Build.
func main() {
lexvec.Build()
}
2 changes: 1 addition & 1 deletion cooc.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"bufio"
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/hjdo/lexvec

go 1.14
121 changes: 119 additions & 2 deletions main.go → lexvec.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,19 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"flag"
"fmt"
"math/rand"
"os"
"path"
"runtime/pprof"
"strings"
)

type OovVectors map[string][]float64
type (
idxUint = uint32
countUint = uint32
Expand Down Expand Up @@ -116,7 +118,7 @@ func init() {
ctxbreakbytes = []byte(ctxBreakToken)
}

func main() {
func Build() {
randng = rand.New(rand.NewSource(1))
flags := flag.NewFlagSet("default", flag.ExitOnError)
flags.StringVar(&corpusPath, "corpus", "", "path to corpus")
Expand Down Expand Up @@ -229,3 +231,118 @@ func main() {

logln(infoLogLevel, "finished!")
}

// GetOovVectors computes a vector for one word from the binary model file at
// subvecsOutputPath.
//
// words is split on single spaces. The first token is the word to look up.
// Any further tokens are used verbatim as its subwords; when there is exactly
// one token and the minimum subword length stored in the model header is
// positive, the subwords are derived with computeSubwords instead. The result
// is the element-wise mean of the word's own row (when the word is in the
// model vocabulary) and one row per subword, selected via subwordIdx.
//
// It returns an empty, non-nil slice and a nil error when words is empty. It
// returns a nil slice and a non-nil error when the file cannot be opened, its
// magic number or version does not match, or the vocabulary section cannot be
// read completely.
func GetOovVectors(words string, subvecsOutputPath string) ([]float64, error) {
	subvecsOutput, err := os.Open(subvecsOutputPath)
	if err != nil {
		return nil, fmt.Errorf("open subword model %q: %w", subvecsOutputPath, err)
	}
	defer subvecsOutput.Close()

	// Scratch buffer shared by all fixed-width header reads.
	b := make([]byte, float64Bytes)

	// The header fields are read in the order they are stored in the file.
	magicNumber := binaryModelReadUint32(subvecsOutput, b)
	version := binaryModelReadUint32(subvecsOutput, b)
	vocabSize := binaryModelReadUint32(subvecsOutput, b)
	subwordMatrixRows := binaryModelReadUint32(subvecsOutput, b)
	vecDim := binaryModelReadUint32(subvecsOutput, b)
	minN := binaryModelReadUint32(subvecsOutput, b)
	maxN := binaryModelReadUint32(subvecsOutput, b)

	if magicNumber != binaryModelMagicNumber {
		return nil, fmt.Errorf("subword model %q: magic number doesn't match", subvecsOutputPath)
	}
	if version != binaryModelVersion {
		return nil, fmt.Errorf("subword model %q: version number doesn't match", subvecsOutputPath)
	}

	// Vocabulary section: vocabSize entries, each a length followed by that
	// many bytes. A word's index is its position in this list.
	ivWordToIdx := make(map[string]int)
	for i := 0; i < int(vocabSize); i++ {
		wLen := binaryModelReadUint32(subvecsOutput, b)

		wordBytes := make([]byte, wLen)
		// A single Read may return fewer bytes than requested, so keep
		// reading until the word is complete or the file reports an error.
		for filled := 0; filled < len(wordBytes); {
			n, rerr := subvecsOutput.Read(wordBytes[filled:])
			filled += n
			if rerr != nil && filled < len(wordBytes) {
				return nil, fmt.Errorf("read vocabulary word %d from %q: %w", i, subvecsOutputPath, rerr)
			}
		}
		ivWordToIdx[string(wordBytes)] = i
	}

	// The matrix starts right after the vocabulary. Whence 1 means "relative
	// to the current offset" (io.SeekCurrent), so this only queries the position.
	matrixBaseOffset, err := subvecsOutput.Seek(0, 1)
	if err != nil {
		return nil, fmt.Errorf("locate matrix in %q: %w", subvecsOutputPath, err)
	}

	if len(words) == 0 {
		return []float64{}, nil
	}

	parts := strings.Split(words, " ")
	w := parts[0]
	var subwords []string
	if minN > 0 && len(parts) == 1 {
		subwords = computeSubwords(w, int(minN), int(maxN))
	} else {
		subwords = parts[1:]
	}

	// make zero-initialises the accumulator, so no explicit clearing is needed.
	vec := make([]float64, vecDim)
	var vLen int
	if idx, ok := ivWordToIdx[w]; ok {
		sumVecFromBin(subvecsOutput, matrixBaseOffset, vec, idxUint(idx))
		vLen++
	}
	for _, sw := range subwords {
		sumVecFromBin(subvecsOutput, matrixBaseOffset, vec, subwordIdx(sw, vocabSize, subwordMatrixRows-vocabSize))
		vLen++
	}

	// Turn the accumulated sum into a mean; skipped when nothing was summed.
	if vLen > 0 {
		for j := range vec {
			vec[j] /= float64(vLen)
		}
	}
	return vec, nil
}

// StartTrain runs the whole training pipeline programmatically, without going
// through command-line flags.
//
// outputFolder is created if it does not exist and receives vocab.txt,
// vectors.txt and model.bin. corpusP is the path of the training corpus. The
// remaining parameters are copied into the package-level settings of the same
// name (dim, subsample, minFreq, model, window, negative, iterations,
// subwordMinN) before the pipeline steps run in order.
//
// If the output folder cannot be created the failure is reported through
// logln at error level and the function returns without training.
func StartTrain(outputFolder, corpusP string,
	dimP idxUint, subsampleP real,
	minfreqP countUint,
	modelP, windowP, negativeP, iterationsP, subwordMinNP int) {
	// Directories need the execute bit to be traversable, so 0755 rather than
	// 0644. MkdirAll is a no-op when the folder already exists.
	if err := os.MkdirAll(outputFolder, 0755); err != nil {
		logln(errorLogLevel, "create output folder %q failed: %v", outputFolder, err)
		return
	}

	// Fixed seed, same as the one used by Build.
	randng = rand.New(rand.NewSource(1))

	vocabPath = path.Join(outputFolder, "vocab.txt")
	vectorOutputPath = path.Join(outputFolder, "vectors.txt")
	subvecsOutputPath = path.Join(outputFolder, "model.bin")
	corpusPath = corpusP

	dim = dimP
	subsample = subsampleP
	window = windowP
	negative = negativeP
	iterations = iterationsP
	minFreq = minfreqP
	model = modelP
	subwordMinN = subwordMinNP

	// The vocabulary is written to disk and then read back from it.
	buildVocab()
	saveVocab()

	readVocab()
	processSubwords()
	buildCoocMatrix()
	calculateCdsTotalAndLogs()
	initModel()
	train(newTrainIteratorIM())
	saveVectors()
}
80 changes: 80 additions & 0 deletions lexvec_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package lexvec

import (
"reflect"
"testing"
)

func TestGetOovVectors(t *testing.T) {
type args struct {
words string
subvecsOutput string
}
tests := []struct {
name string
args args
want OovVectors
wantErr bool
}{
{
name: "test",
args: args{
words: "test the model",
subvecsOutput: "output/model.bin",
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GetOovVectors(tt.args.words, tt.args.subvecsOutput)
if (err != nil) != tt.wantErr {
t.Errorf("GetOovVectors() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("GetOovVectors() got = %v, want %v", got, tt.want)
}
t.Log(got)
})
}
}

// TestStartTrain runs the full training pipeline through StartTrain and writes
// its results into the "output" folder.
//
// It performs a complete training run, so it is skipped under `go test -short`.
func TestStartTrain(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping full training run in -short mode")
	}

	type args struct {
		outputFolder string
		corpus       string
		dim          idxUint
		subsample    real
		window       int
		negative     int
		iterations   int
		minfreq      countUint
		model        int
		subwordMinN  int
	}
	tests := []struct {
		name string
		args args
	}{
		{
			name: "test",
			args: args{
				outputFolder: "output",
				// TODO(review): machine-specific absolute path; point this at a
				// corpus that exists on the machine running the test.
				corpus:      "C:/Users/starsc/Downloads/corpus.txt",
				dim:         300,
				subsample:   1e-5,
				window:      2,
				negative:    5,
				iterations:  5,
				minfreq:     100,
				model:       0,
				subwordMinN: 0,
			},
		},
	}
	for _, tt := range tests {
		tt := tt // per-iteration copy; go.mod targets a Go version that shares the loop variable
		t.Run(tt.name, func(t *testing.T) {
			a := tt.args
			StartTrain(a.outputFolder, a.corpus, a.dim, a.subsample, a.minfreq,
				a.model, a.window, a.negative, a.iterations, a.subwordMinN)
		})
	}
}
2 changes: 1 addition & 1 deletion matrix.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

type matrix interface {
get(row, col idxUint) countUint
Expand Down
2 changes: 1 addition & 1 deletion model.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"bufio"
Expand Down
2 changes: 1 addition & 1 deletion sampling.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"math"
Expand Down
2 changes: 1 addition & 1 deletion sgd.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

// #include "sgdcgo.h"
import "C"
Expand Down
2 changes: 1 addition & 1 deletion text.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"bufio"
Expand Down
2 changes: 1 addition & 1 deletion train.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"bufio"
Expand Down
6 changes: 3 additions & 3 deletions utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"errors"
Expand All @@ -31,7 +31,7 @@ import (
// Helper for aborting on error.
//
// NOTE(review): this listing is a rendered diff, so both versions of the
// changed line are shown: presumably panic(e) is the removed line and the
// logln call is its replacement — confirm against the real file. If logln at
// errorLogLevel no longer terminates the process (the os.Exit(1) call in doLog
// is commented out in the same change), check would return normally on a
// non-nil error instead of aborting — verify that callers can tolerate that.
func check(e error) {
if e != nil {
panic(e)
logln(errorLogLevel, "panic: %v", e)
}
}

Expand Down Expand Up @@ -62,7 +62,7 @@ func doLog(level int, msg string, lineBreak bool, args ...interface{}) {
}
os.Stderr.Sync()
if level == errorLogLevel {
os.Exit(1)
//os.Exit(1)
}
}

Expand Down
2 changes: 1 addition & 1 deletion vocab.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package main
package lexvec

import (
"bufio"
Expand Down