1
2
3
4 package net.sourceforge.pmd.cpd;
5
import java.util.List;
import java.util.Locale;
7
8
9
10
11
12
13 public abstract class AbstractTokenizer implements Tokenizer {
14
15
16
17 protected List<String> stringToken;
18 protected List<String> ignorableCharacter;
19
20 protected List<String> ignorableStmt;
21 protected char oneLineCommentChar = '#';
22
23 private List<String> code;
24 private int lineNumber = 0;
25 private String currentLine;
26
27 protected boolean spanMultipleLinesString = true;
28
29 private boolean downcaseString = true;
30
31 public void tokenize(SourceCode tokens, Tokens tokenEntries) {
32 code = tokens.getCode();
33
34 for ( lineNumber = 0; lineNumber < code.size(); lineNumber++ ) {
35 currentLine = code.get(lineNumber);
36 int loc = 0;
37 while ( loc < currentLine.length() ) {
38 StringBuilder token = new StringBuilder();
39 loc = getTokenFromLine(token,loc);
40 if (token.length() > 0 && !isIgnorableString(token.toString())) {
41 if (downcaseString) {
42 token = new StringBuilder(token.toString().toLowerCase());
43 }
44
45
46
47
48 tokenEntries.add(new TokenEntry(token.toString(),
49 tokens.getFileName(),
50 lineNumber)
51 );
52
53 }
54 }
55 }
56 tokenEntries.add(TokenEntry.getEOF());
57 }
58
59 private int getTokenFromLine(StringBuilder token, int loc) {
60 for (int j = loc; j < currentLine.length(); j++) {
61 char tok = currentLine.charAt(j);
62 if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
63 if (isComment(tok)) {
64 if (token.length() > 0) {
65 return j;
66 } else {
67 return getCommentToken(token, loc);
68 }
69 } else if (isString(tok)) {
70 if (token.length() > 0) {
71 return j;
72 } else {
73
74 return parseString(token, j, tok);
75 }
76 } else {
77 token.append(tok);
78 }
79 } else {
80 if (token.length() > 0) {
81 return j;
82 }
83 }
84 loc = j;
85 }
86 return loc + 1;
87 }
88
89 private int parseString(StringBuilder token, int loc, char stringDelimiter) {
90 boolean escaped = false;
91 boolean done = false;
92 char tok = ' ';
93 while ((loc < currentLine.length()) && ! done) {
94 tok = currentLine.charAt(loc);
95 if (escaped && tok == stringDelimiter) {
96 escaped = false;
97 } else if (tok == stringDelimiter && (token.length() > 0)) {
98 done = true;
99 } else if (tok == '\\') {
100 escaped = true;
101 } else {
102 escaped = false;
103 }
104
105 token.append(tok);
106 loc++;
107 }
108
109 if ( ! done &&
110 loc >= currentLine.length() &&
111 spanMultipleLinesString &&
112 lineNumber < code.size() - 1
113 ) {
114
115 currentLine = code.get(++lineNumber);
116
117 loc = parseString(token, loc, stringDelimiter);
118 }
119 return loc + 1;
120 }
121
122 private boolean ignoreCharacter(char tok)
123 {
124 return ignorableCharacter.contains(String.valueOf(tok));
125 }
126
127 private boolean isString(char tok)
128 {
129 return stringToken.contains(String.valueOf(tok));
130 }
131
132 private boolean isComment(char tok)
133 {
134 return tok == oneLineCommentChar;
135 }
136
137 private int getCommentToken(StringBuilder token, int loc)
138 {
139 while (loc < currentLine.length())
140 {
141 token.append(currentLine.charAt(loc++));
142 }
143 return loc;
144 }
145
146 private boolean isIgnorableString(String token)
147 {
148 return ignorableStmt.contains(token);
149 }
150 }