-
Notifications
You must be signed in to change notification settings - Fork 69
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP: add javascript tokenizer #4
base: master
Are you sure you want to change the base?
Changes from 1 commit
457ff40
eb49905
8ea9116
10fde12
8ab4407
bd4272b
d41550e
962360b
49cd0c2
298ed54
fce1f14
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
*.swp | ||
*.swo | ||
*.log | ||
|
||
node_modules | ||
.DS_Store |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
const esprima = require('esprima') | ||
const escodegen = require('escodegen') | ||
const fs = require('fs-extra-promise') | ||
const tokenizer = require('./tokenizer') | ||
|
||
const immutable = require('immutable') | ||
const walk = require('esprima-walk') | ||
|
||
const { base64FileName } = require('./util') | ||
|
||
const estools = require('estools') | ||
|
||
const TOKENIZER_SCOPE_FILE = 'file-scope' | ||
const TOKENIZER_SCOPE_FUNCTION = 'function-scope' | ||
|
||
const TOKENIZER_SCOPE = TOKENIZER_SCOPE_FILE | ||
|
||
// TODO: estools map / filter / traverse (instead of walk) | ||
// - filter subfunctions from function asts somehow | ||
// - test on SCC | ||
|
||
// TODO: get rid of the function block and indentation | ||
// Regenerate JavaScript source text from a function's AST node.
// NOTE: a codegen option `format: { parentheses: false }` was tried here to
// normalize output but did not help, so escodegen defaults are used.
// TODO: get rid of the function block braces and indentation
const regenerateFunctionCode = function (functionAst) {
  // NOTE: functionAst.body omits the function signature (returns the block only)
  return escodegen.generate(functionAst.body, {})
}
|
||
// Tokenize one source file for SourcererCC.
// - file scope: the whole file becomes a single tokenized block
// - function scope: each FunctionExpression becomes its own block
// Returns an immutable.List of tokenizer output strings.
const processFile = function (fileName, data) {
  // TODO(review): SourcererCC expects parentId and blockId to be integers
  // (or longs) — derive a numeric id (e.g. repo hash + file hash) instead.
  //let parentId = base64FileName(fileName) // TODO: incorporate repo name / hash
  const parentId = fileName
  let blockId = 1

  if (TOKENIZER_SCOPE === TOKENIZER_SCOPE_FILE) {
    // File scope: one block per file, so blockId stays at its initial value.
    return immutable.List.of(tokenizer(data, parentId, blockId))
  }

  const options = {
    loc: true,
    range: true,
    comment: true,
    attachComment: true
  }
  const fileAst = esprima.parse(data, options)

  let functions = immutable.List()
  let functionTokens = immutable.List()
  walk(fileAst, (node) => {
    if (node.type === 'FunctionExpression') {
      // TODO: use estools.map/filter to strip nested FunctionExpressions so
      // each extracted block contains only its own tokens (not its inner
      // functions'); tested approach commented out pending a fix.
      const functionAstShallow = node
      const functionCode = regenerateFunctionCode(functionAstShallow)
      functions = functions.push(functionCode)
      // blockId is incremented here, once per extracted function.
      const tokenizedFunction = tokenizer(functionCode, parentId, blockId++)
      functionTokens = functionTokens.push(tokenizedFunction)
    }
  })

  return functionTokens
}
|
||
|
||
// Print one tokenized block per line to stdout.
// TODO(review): write results to a dedicated output file instead and keep
// console.log only for debug/auxiliary information.
const outputFile = function (functionTokens) {
  functionTokens.forEach((f) => {
    console.log(f)
  })
}
|
||
// CLI entry point.
// process.argv[2] — path of the JavaScript file to read and tokenize
// process.argv[3] — identifier used as the parentId in the tokenizer output
// TODO: validate input (missing arguments, unreadable file)
const fileName = process.argv[2]

fs.readFileAsync(fileName).then((data) => {
  outputFile(processFile(process.argv[3], data))
}).catch((err) => {
  // Surface read/parse failures instead of silently dropping the rejection.
  console.error(`Failed to process ${fileName}:`, err)
  process.exitCode = 1
});
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"name": "jstokenizer", | ||
"version": "1.0.0", | ||
"description": "", | ||
"main": "index.js", | ||
"scripts": { | ||
"test": "echo \"Error: no test specified\" && exit 1" | ||
}, | ||
"author": "Jakub Žitný <jakubzitny@avocode.com> (https://github.com/jakubzitny)", | ||
"license": "ISC", | ||
"dependencies": { | ||
"escodegen": "^1.8.0", | ||
"esprima": "^2.7.2", | ||
"esprima-ast-utils": "0.0.6", | ||
"esprima-walk": "^0.1.0", | ||
"estools": "^2.1.0", | ||
"fs-extra-promise": "^0.3.1", | ||
"immutable": "^3.8.1", | ||
"lodash": "^4.13.1" | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# JavaScript tokenizer for SourcererCC | ||
|
||
- use `node 6.*` | ||
- run as `node index.js /path/to/file.js` | ||
- (carefully) use `batch.sh` to apply to larger dataset |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
const _ = require('lodash') | ||
const immutable = require('immutable') | ||
const fs = require('fs-extra-promise') | ||
const esprima = require('esprima') | ||
|
||
const MAIN_DELIMITER = '@#@' | ||
const COUNT_DELIMITER = '@@::@@' | ||
const TOKEN_DELIMITER = ',' | ||
|
||
// Predicate: does `token` (an esprima token object) have the given `type`?
// Argument order (type first) allows partial application per token type.
const filterTokens = function (type, token) {
  return token.type === type
}
|
||
|
||
// NOTE: http://esprima.org/doc/#usage | ||
// NOTE: http://esprima.org/doc/#usage
// NOTE(review): tokenTypes/tokenFilters are currently unreferenced except by
// commented-out code in tokenizer() — either wire them into the per-type
// filtering there or remove them.
const tokenTypes = immutable.List.of(
  'Boolean',
  'Identifier',
  'Keyword',
  'Null',
  'Numeric',
  'Punctuator',
  'String',
  'RegularExpression'
)

// One filter predicate per token type (partial application of filterTokens).
const tokenFilters = tokenTypes.map((tokenType) => {
  return _.partial(filterTokens, tokenType)
})
|
||
// Tokenize `code` and format it as one SourcererCC input line:
//   "<parentId>,<blockId>,@#@tok1@@::@@count1,tok2@@::@@count2,..."
// where each distinct token value appears once with its occurrence count.
const tokenizer = function (code, parentId, blockId) {
  const options = {}
  const tokens = immutable.List(esprima.tokenize(code, options))

  // TODO: reduce to map / per-type filtering via tokenFilters
  // Count occurrences of each distinct token value.
  // update(key, notSetValue, fn) covers both first-seen and repeat tokens,
  // replacing the previous has()/set() branching that leaked an implicit
  // global (`newUniqueTokens`).
  let uniqueTokens = immutable.Map()
  tokens.forEach((token) => {
    uniqueTokens = uniqueTokens.update(token.value, 0, (count) => {
      return count + 1
    })
  })

  const tokenPairs = uniqueTokens.map((count, token) => {
    return `${token}${COUNT_DELIMITER}${count}`
  })

  const lhs = `${parentId},${blockId},`
  const rhs = tokenPairs.join(TOKEN_DELIMITER)
  const output = `${lhs}${MAIN_DELIMITER}${rhs}`

  return output
};
|
||
module.exports = tokenizer |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
|
||
// Encode a file name as base64 (candidate stable textual id for a file).
const base64FileName = (fileName) => Buffer.from(fileName).toString('base64')
|
||
|
||
module.exports = { | ||
base64FileName | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is good, but is it also possible to extract function signature and append the body to it? we can add a TODO maybe?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is possible. Is it needed though? Do other tokenizers include the signature? Maybe I'll add it into config file.
Because when we do queries with SourcererCC, e.g. snippets from StackOverflow, I think the signature is not always good idea for finding a match, especially when the snippet is only few lines of simple expressions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, other parsers do use method signatures. SourcererCC uses everything in a method except for comments and some symbols. It gives good recall as well as precision with current strategy. May be in future we can setup an experiment to test the hypothesis (clone detection without using method signatures.). But for now, lets use it and make it consistent with other parsers.