libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
proteinintegercode.cpp
Go to the documentation of this file.
1/**
2 * \file protein/proteinintegercode.cpp
3 * \date 22/05/2023
4 * \author Olivier Langella
5 * \brief transform protein amino acid sequence into vectors of amino acid codes
6 */
7
8
9/*******************************************************************************
10 * Copyright (c) 2023 Olivier Langella
11 *<Olivier.Langella@universite-paris-saclay.fr>.
12 *
13 * This file is part of PAPPSOms++.
14 *
15 * PAPPSOms++ is free software: you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation, either version 3 of the License, or
18 * (at your option) any later version.
19 *
20 * PAPPSOms++ is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with PAPPSOms++. If not, see <http://www.gnu.org/licenses/>.
27 *
28 ******************************************************************************/
29
30#include "proteinintegercode.h"
31#include "../exception/exceptionoutofrange.h"
32
33using namespace pappso;
34
36 const AaStringCodec &codec,
37 std::size_t aa_str_max_size)
38{
39 msp_protein = protein;
40
41 if(aa_str_max_size > 7)
42 {
44 QObject::tr("aa_str_max_size exceeds max size"));
45 }
46
47 QString seq_str = protein.get()->getSequence();
48 m_seqAaCode.clear();
49
50 for(const QChar &aa_str : seq_str)
51 {
52 m_seqAaCode.push_back(codec.getAaCode().getAaCode(aa_str.toLatin1()));
53 }
54
55 for(std::size_t i = 2; i <= aa_str_max_size; i++)
56 {
58 }
59}
60
69
73const std::vector<std::uint8_t> &
75{
76 return m_seqAaCode;
77}
78
81{
82 return msp_protein;
83}
84
85
86std::vector<std::uint32_t>
88 const AaStringCodec &codec, std::size_t fragment_size) const
89{
90 std::vector<std::uint32_t> fragments;
91
92 int max = (m_seqAaCode.size() - fragment_size);
93 if(max < 0)
94 return fragments;
95
96 auto it = m_seqAaCode.begin();
97 for(int i = 0; i <= max; i++)
98 {
99 fragments.push_back(codec.codeLlc(it, fragment_size));
100 it++;
101 }
102
103 return fragments;
104}
105
106
107const std::vector<std::uint32_t> &
109{
110 if(size < 2)
111 {
112
113 throw ExceptionOutOfRange(QObject::tr("size too small"));
114 }
115 std::size_t indice = size - 2;
116 if(indice < m_peptideCodedFragments.size())
117 {
118 return m_peptideCodedFragments.at(indice);
119 }
120
121 throw ExceptionOutOfRange(QObject::tr("size too big"));
122}
123
124
125std::vector<std::pair<std::size_t, std::uint32_t>>
127 const std::vector<uint32_t> &code_list_in) const
128{
129 std::vector<std::pair<std::size_t, std::uint32_t>> return_pos;
130 std::vector<uint32_t> code_list = code_list_in;
131
132 std::sort(code_list.begin(), code_list.end());
133 auto it_end = std::unique(code_list.begin(), code_list.end());
134 for(auto it_code = code_list.begin(); it_code != it_end; it_code++)
135 {
136
137 std::size_t size = 2;
138 for(auto &liste_protein_seq_code : m_peptideCodedFragments)
139 {
140
141 auto it_seq_position = std::find(liste_protein_seq_code.begin(),
142 liste_protein_seq_code.end(),
143 *it_code);
144 while(it_seq_position != liste_protein_seq_code.end())
145 {
146 // found
147 std::size_t position =
148 std::distance(liste_protein_seq_code.begin(), it_seq_position);
149 return_pos.push_back({size, position});
150
151 it_seq_position = std::find(
152 ++it_seq_position, liste_protein_seq_code.end(), *it_code);
153 qDebug();
154 }
155 size++;
156 qDebug();
157 }
158 qDebug();
159 }
160
161 return return_pos;
162}
163
164std::vector<double>
166 const std::vector<uint32_t> &code_list_from_spectrum) const
167{
168 std::vector<double> convolution_score;
169
170
171 std::vector<std::uint8_t>::const_iterator it_aa = m_seqAaCode.begin();
172 auto it_couple = m_peptideCodedFragments[0].begin();
173 auto it_trio = m_peptideCodedFragments[1].begin();
174 auto it_quatro = m_peptideCodedFragments[2].begin();
175 auto it_cinqo = m_peptideCodedFragments[3].begin();
176 for(std::uint8_t aa_code : m_seqAaCode)
177 {
178 convolution_score.push_back(convolutionKernel(code_list_from_spectrum,
179 it_aa,
180 it_couple,
181 it_trio,
182 it_quatro,
183 it_cinqo));
184 it_aa++;
185 it_couple++;
186 it_trio++;
187 it_quatro++;
188 it_cinqo++;
189 }
190
191 return convolution_score;
192}
193
194double
196 const std::vector<uint32_t> &spectrum_code_list,
197 std::vector<std::uint8_t>::const_iterator it_aa,
198 std::vector<std::uint32_t>::const_iterator it_couple,
199 std::vector<std::uint32_t>::const_iterator it_trio,
200 std::vector<std::uint32_t>::const_iterator it_quatro,
201 std::vector<std::uint32_t>::const_iterator it_cinqo) const
202{
203 double score = 0;
204
205 // single :
206 auto it_end = it_aa + 5;
207 auto it = it_aa;
208 double single_score = 0;
209 while(it != it_end)
210 {
211 if(std::binary_search(spectrum_code_list.begin(),
212 spectrum_code_list.end(),
213 (std::uint32_t)(*it)))
214 {
215 single_score += 1;
216 }
217 else
218 {
219 single_score += 0.1;
220 }
221 it++;
222 }
223
224 // duo
225 auto itduo_end = it_couple + 4;
226 auto itduo = it_couple;
227 double duo_score = 0;
228 while(itduo != itduo_end)
229 {
230 if(std::binary_search(
231 spectrum_code_list.begin(), spectrum_code_list.end(), *itduo))
232 {
233 duo_score += 3;
234 }
235 else
236 {
237 duo_score += 0.2;
238 }
239 itduo++;
240 }
241
242
243 // trio
244 auto it3_end = it_trio + 3;
245 auto it3 = it_trio;
246 double trio_score = 0;
247 while(it3 != it3_end)
248 {
249 if(std::binary_search(
250 spectrum_code_list.begin(), spectrum_code_list.end(), *it3))
251 {
252 trio_score += 6;
253 }
254 else
255 {
256 trio_score += 0.5;
257 }
258 it3++;
259 }
260
261
262 // quatro
263 auto it4_end = it_quatro + 2;
264 auto it4 = it_quatro;
265 double quatro_score = 0;
266 while(it4 != it4_end)
267 {
268 // element < value
269 if(std::binary_search(
270 spectrum_code_list.begin(), spectrum_code_list.end(), *it4))
271 {
272 quatro_score += 8;
273 }
274 quatro_score += 0.8;
275 it4++;
276 }
277
278 // cinqo
279 if(std::binary_search(
280 spectrum_code_list.begin(), spectrum_code_list.end(), *it_cinqo))
281 {
282 score += 10;
283 }
284 else
285 {
286 score += 1;
287 }
288 return score * single_score * duo_score * trio_score * quatro_score;
289}
uint8_t getAaCode(char aa_letter) const
Definition aacode.cpp:81
const AaCode & getAaCode() const
uint32_t codeLlc(const QString &aa_str) const
get the lowest common denominator integer from amino acide suite string
const std::vector< std::uint32_t > & getPeptideCodedFragment(std::size_t size) const
std::vector< std::pair< std::size_t, std::uint32_t > > match(const std::vector< uint32_t > &code_list) const
list of positions and matched codes along protein sequence
std::vector< double > convolution(const std::vector< uint32_t > &code_list_from_spectrum) const
process convolution of spectrum code list along protein sequence
const std::vector< std::uint8_t > & getSeqAaCode() const
std::vector< std::uint8_t > m_seqAaCode
double convolutionKernel(const std::vector< uint32_t > &spectrum_code_list, std::vector< std::uint8_t >::const_iterator it_aa, std::vector< std::uint32_t >::const_iterator it_couple, std::vector< std::uint32_t >::const_iterator it_trio, std::vector< std::uint32_t >::const_iterator it_quatro, std::vector< std::uint32_t >::const_iterator it_cinqo) const
ProteinIntegerCode(ProteinSp protein, const AaStringCodec &codec, std::size_t aa_str_max_size=5)
std::vector< std::vector< std::uint32_t > > m_peptideCodedFragments
std::vector< std::uint32_t > computePeptideCodeFragments(const AaStringCodec &codec, std::size_t fragment_size) const
pappso::ProteinSp getProteinSp() const
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
std::shared_ptr< const Protein > ProteinSp
shared pointer on a Protein object
Definition protein.h:47
@ max
maximum of intensities
transform protein amino acid sequence into vectors of amino acid codes