% Conference paper, IEEE BIBM 2020 (citation key kept as exported so existing \cite commands still resolve).
% Cleanup applied to the raw export:
%   - "Ca" restored to C$\alpha$ (C-alpha) in title, abstract and keywords.
%   - Compound surnames "Al Sallal" / "Al Nasr" moved into the surname position.
%   - Percent signs in the abstract escaped so styles that print it do not break.
%   - Funding, copyright and conference-date text moved out of the printed note
%     into the non-printing funding/copyright fields and eventdate.
%   - series dropped because it only repeated booktitle verbatim.
% NOTE(review): the export read "S4% accuracy" for PCASSO; transcribed here as 84\%,
%   which looks like an OCR slip -- confirm against the published abstract.
@inproceedings{19ce43891b3b4cdfbcd62a3813e27a26,
  author    = {Al Sallal, Mohammad and Chen, Wei and Al Nasr, Kamal},
  title     = {Machine Learning Approach to Assign Protein Secondary Structure Elements from {C$\alpha$} Trace},
  booktitle = {Proceedings - 2020 {IEEE} International Conference on Bioinformatics and Biomedicine, {BIBM} 2020},
  editor    = {Park, Taesung and Cho, Young-Rae and Hu, Xiaohua Tony and Yoo, Illhoi and Woo, Hyun Goo and Wang, Jianxin and Facelli, Julio and Nam, Seungyoon and Kang, Mingon},
  publisher = {Institute of Electrical and Electronics Engineers, Inc.},
  pages     = {35--41},
  year      = {2020},
  month     = dec,
  day       = {16},
  eventdate = {2020-12-16/2020-12-19},
  doi       = {10.1109/BIBM49941.2020.9313137},
  language  = {English (US)},
  keywords  = {C$\alpha$ backbone, chain trace, protein, protein modeling, secondary structure assignment, secondary structure prediction},
  abstract  = {Secondary structure elements in protein molecules refer to local sub-conformational regions stabilized by hydrogen bonding. Secondary structure elements can be divided into helical, sheet, or loop. Secondary structure elements bolster the folding and topology of the protein. They are important for modern structural bioinformatics such as protein modeling and functional analysis. Therefore, assigning the types of secondary structures in proteins is crucial. Many methods have been developed to address the problem. Methods can be categorized into two approaches. One approach uses the information about hydrogen bonding and energy while the other approach uses protein trace geometry. If the information of some atoms is missing, the second approach is more feasible. In this paper, we develop a machine learning method that belongs to the second approach to assign secondary structure elements. We develop a 3-state machine learning classifier. The classifier uses protein's C$\alpha$ information only. The classifier ensembles four (4) machine learning models: Random Forest, Support Vector Machine, Multilayer Perceptron, and eXtreme Gradient Boosting. The classifier is trained with 600K amino acids. We tested our classifier at two different data sets. One data set contains 150K amino acids. The accuracy of our system was 94.6\%. In addition, the classifier was tested on a set of 20 protein structures and compared with PCASSO from the same category. The information from Protein Data Bank was used as a reference. The comparison shows that our method can produce assignments that are more aligned with PDB at 93\% accuracy while PCASSO achieved 84\% accuracy.},
  funding   = {This work was supported by the NIH Academic Research Enhancement Award (R15 AREA: 1R15GM126509 01).},
  copyright = {{\textcopyright} 2020 IEEE},
}