% Conference paper: nested-bigram features for Python source code authorship
% attribution (ICDMW 2018 proceedings, pp. 23--28).
% NOTE(review): month/day below (2 July) do not match the conference dates
% recorded in the note field (17-20 Nov 2018); presumably an export-system
% date -- confirm against the publisher record before relying on it.
% NOTE(review): address holds a country, not the publisher's city; left as
% exported because the city cannot be determined from this record.
@inproceedings{704111d1d1f94cd681c91f343bf0e1d3,
  author    = {Hozhabrierdi, Pegah and {Fuentes Hitos}, Dunai and Mohan, Chilukuri K.},
  title     = {{Python} source code de-anonymization using nested bigrams},
  booktitle = {Proceedings - 18th {IEEE} International Conference on Data Mining Workshops, {ICDMW} 2018},
  editor    = {Tong, Hanghang and Li, Zhenhui and Zhu, Feida and Yu, Jeffrey},
  series    = {{IEEE} International Conference on Data Mining Workshops, {ICDMW}},
  pages     = {23--28},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2018},
  month     = jul,
  day       = {2},
  doi       = {10.1109/ICDMW.2018.00011},
  language  = {English (US)},
  keywords  = {abstract syntax tree, feature extraction, feature ranking, source code de-anonymization, source code stylometry},
  abstract  = {An important issue in cybersecurity is the insertion or modification of code by individuals other than the original authors of the code. This motivates research on authorship attribution of unknown source code. We have addressed the deficiencies of previously used feature extraction methods and propose a novel approach: Nested Bigrams. Such features are easy to extract and carry substantial information about the interconnections between the nodes of the abstract syntax tree. We also show that for large number of authors, a Strongly Regularized Feed-forward Neural Network outperforms the Random Forest Classifier used in many code stylometric studies. A new ranking system for reducing the number of features is also proposed, and experiments show that this approach can reduce the feature set to 98 nested bigrams while maintaining a classification accuracy above 90 percent.},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 18th IEEE International Conference on Data Mining Workshops, ICDMW 2018 ; Conference date: 17-11-2018 Through 20-11-2018},
}