Test suite by vmarkovtsev · Pull Request #7 · src-d/go-license-detector · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Test suite #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
dist: trusty
sudo: false
sudo: required

language: go

Expand Down
11 changes: 8 additions & 3 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
name = "github.com/jdkato/prose"
version = "1.1.0"

[[constraint]]
[[override]]
name = "github.com/sergi/go-diff"
version = "1.0.0"
revision = "da645544ed44df016359bd4c0e3dc60ee3a0da43"

[[constraint]]
name = "github.com/stretchr/testify"
Expand All @@ -65,6 +65,10 @@
name = "gopkg.in/src-d/go-git.v4"
version = "4.1.0"

[[constraint]]
name = "github.com/spf13/pflag"
branch = "master"

[prune]
go-tests = true
unused-packages = true
2 changes: 1 addition & 1 deletion cmd/license-detector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ func main() {
wg.Add(pflag.NArg())
for argIndex, arg := range pflag.Args() {
go func(argIndex int, arg string) {
defer wg.Done()
_, err := os.Stat(arg)
var licenses map[string]float32
if err == nil {
Expand All @@ -45,7 +46,6 @@ func main() {
os.Exit(1)
}
results[argIndex] = analysisResult{Name: arg, Licenses: licenses}
wg.Done()
}(argIndex, arg)
}
wg.Wait()
Expand Down
Binary file added dataset.zip
Binary file not shown.
66 changes: 66 additions & 0 deletions dataset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package ld

import (
"archive/zip"
"fmt"
"io/ioutil"
"strings"
"sync"
"testing"

"github.com/stretchr/testify/assert"
)

func TestDataset(t *testing.T) {
zipfile, err := zip.OpenReader("dataset.zip")
assert.Nil(t, err)
defer zipfile.Close()
projects := map[string][]*zip.File{}
for _, f := range zipfile.File {
path := strings.Split(f.Name, "/")
if path[1] != "" {
files := projects[path[0]]
if files == nil {
files = []*zip.File{}
}
files = append(files, f)
projects[path[0]] = files
}
}
licenses := map[string]map[string]float32{}
9E7A mutex := sync.Mutex{}
wg := sync.WaitGroup{}
wg.Add(len(projects))
for project, files := range projects {
go func(project string, files []*zip.File) {
defer wg.Done()
myFilesList := make([]string, 0, len(files))
myFilesMap := map[string]*zip.File{}
for _, f := range files {
name := f.Name[strings.Index(f.Name, "/")+1:]
myFilesList = append(myFilesList, name)
myFilesMap[name] = f
}
myLicenses, _ := InvestigateFilesLicenses(myFilesList, func(name string) (string, error) {
reader, err := myFilesMap[name].Open()
if err != nil {
return "", err
}
defer reader.Close()
bytes, err := ioutil.ReadAll(reader)
if err != nil {
return "", err
}
return string(bytes), nil
})
if len(myLicenses) > 0 {
mutex.Lock()
licenses[project] = myLicenses
mutex.Unlock()
}
}(project, files)
}
wg.Wait()
assert.True(t, len(licenses) >= 766)
fmt.Printf("%d %d %d%%\n", len(licenses), len(projects), (100 * len(licenses)) / len(projects))
}
34 changes: 1 addition & 33 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,44 +201,12 @@ func (db *LicenseDatabase) QueryLicenseText(text string) map[string]float32 {
tokarr[len(db.tokens)] = "!"
println(dmp.DiffPrettyText(dmp.DiffCharsToLines(diff, tokarr)))
}

// TODO(vmarkovtsev): replace with dmp.DiffLevenshtein when this PR is merged:
// https://github.com/sergi/go-diff/pull/90
distance := diffLevenshtein(diff)
distance := dmp.DiffLevenshtein(diff)
candidates[key] = float32(1) - float32(distance)/float32(len(myRunes))
}
return candidates
}

// diffLevenshtein derives an edit distance from a sequence of diff operations.
//
// Consecutive insertions and deletions are accumulated until an equality is
// reached; the larger of the two totals is then added to the distance,
// because a deletion paired with an insertion counts as one substitution.
// Lengths are taken with len(Text) — NOTE(review): that is a byte count if
// Text is a string; confirm this matches the unit the caller normalises by.
func diffLevenshtein(diffs []diffmatchpatch.Diff) int {
	var distance, inserted, deleted int

	// settle adds the pending run of edits to the distance and resets it.
	settle := func() {
		if inserted < deleted {
			distance += deleted
		} else {
			distance += inserted
		}
		inserted, deleted = 0, 0
	}

	for _, op := range diffs {
		if op.Type == diffmatchpatch.DiffInsert {
			inserted += len(op.Text)
		} else if op.Type == diffmatchpatch.DiffDelete {
			deleted += len(op.Text)
		} else if op.Type == diffmatchpatch.DiffEqual {
			// An equality ends the current run of edits.
			settle()
		}
	}

	// Account for edits that trail the last equality.
	settle()
	return distance
}

// QueryReadmeText tries to detect licenses mentioned in the README.
func (db *LicenseDatabase) QueryReadmeText(text string) map[string]float32 {
return investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes)
Expand Down
24 changes: 12 additions & 12 deletions investigate.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,13 @@ func InvestigateFilesLicenses(
if len(candidates) == 0 {
return nil, ErrNoLicenseFound
}
licenses := InvestigateReadmeFiles(candidates)
licenses := InvestigateReadmeTexts(candidates)
if len(licenses) == 0 {
return nil, ErrNoLicenseFound
}
return licenses, nil
}
return InvestigateLicenseFiles(candidates), nil
return InvestigateLicenseTexts(candidates), nil
}

// ExtractLicenseFiles returns the list of possible license texts.
Expand All @@ -110,12 +110,12 @@ func ExtractLicenseFiles(files []string, reader func(string) (string, error)) []
return candidates
}

// InvestigateLicenseFiles takes the list of candidate license texts and returns the most probable
// InvestigateLicenseTexts takes the list of candidate license texts and returns the most probable
// reference licenses matched. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
func InvestigateLicenseFiles(texts []string) map[string]float32 {
func InvestigateLicenseTexts(texts []string) map[string]float32 {
maxLicenses := map[string]float32{}
for _, text := range texts {
candidates := InvestigateLicenseFile(text)
candidates := InvestigateLicenseText(text)
for name, sim := range candidates {
maxSim := maxLicenses[name]
if sim > maxSim {
Expand All @@ -126,9 +126,9 @@ func InvestigateLicenseFiles(texts []string) map[string]float32 {
return maxLicenses
}

// InvestigateLicenseFile takes the license text and returns the most probable reference licenses matched.
// InvestigateLicenseText takes the license text and returns the most probable reference licenses matched.
// Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
func InvestigateLicenseFile(text string) map[string]float32 {
func InvestigateLicenseText(text string) map[string]float32 {
return globalLicenseDatabase.QueryLicenseText(text)
}

Expand All @@ -150,12 +150,12 @@ func ExtractReadmeFiles(files []string, reader func(string) (string, error)) []s
return candidates
}

// InvestigateReadmeFiles scans README files for licensing information and outputs the
// InvestigateReadmeTexts scans README files for licensing information and outputs the
// probable names using NER.
func InvestigateReadmeFiles(texts []string) map[string]float32 {
func InvestigateReadmeTexts(texts []string) map[string]float32 {
maxLicenses := map[string]float32{}
for _, text := range texts {
candidates := InvestigateReadmeFile(text)
candidates := InvestigateReadmeText(text)
for name, sim := range candidates {
maxSim := maxLicenses[name]
if sim > maxSim {
Expand All @@ -166,9 +166,9 @@ func InvestigateReadmeFiles(texts []string) map[string]float32 {
return maxLicenses
}

// InvestigateReadmeFile scans the README file for licensing information and outputs probable
// InvestigateReadmeText scans the README file for licensing information and outputs probable
// names found with Named Entity Recognition from NLP.
func InvestigateReadmeFile(text string) map[string]float32 {
func InvestigateReadmeText(text string) map[string]float32 {
return globalLicenseDatabase.QueryReadmeText(text)
}

Expand Down
9 changes: 6 additions & 3 deletions nlp.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ var (
digitsRe = regexp.MustCompile("[0-9]+")
)

// investigateReadmeFile is the implementation of InvestigateReadmeFile.
// investigateReadmeFile is the implementation of InvestigateReadmeText.
// It takes two additional arguments: licenseNameParts and licenseNameSizes.
// The idea is to map substrings to real licenses, and the confidence is
// <the number of matches> / <overall number of substrings>.
Expand All @@ -41,10 +41,13 @@ func investigateReadmeFile(
text[beginIndex] != '\n' && beginIndex < matches[0][0]; beginIndex++ {
}
}
for ; text[endIndex] != ' ' && text[endIndex] != '\t' &&
text[endIndex] != '\n' && endIndex < len(text); endIndex++ {
for ; endIndex < len(text) && text[endIndex] != ' ' && text[endIndex] != '\t' &&
text[endIndex] != '\n'; endIndex++ {
}
}
if endIndex > len(text) {
endIndex = len(text)
}
suspectedText := text[beginIndex:endIndex]
suspectedWords := tokenize.TextToWords(suspectedText)
tagger := tag.NewPerceptronTagger()
Expand Down
Loading
294D
0