Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
CSVParser |
|
| 3.727272727272727;3.727 |
1 | package au.com.bytecode.opencsv; | |
2 | ||
3 | /** | |
4 | Copyright 2005 Bytecode Pty Ltd. | |
5 | ||
6 | Licensed under the Apache License, Version 2.0 (the "License"); | |
7 | you may not use this file except in compliance with the License. | |
8 | You may obtain a copy of the License at | |
9 | ||
10 | http://www.apache.org/licenses/LICENSE-2.0 | |
11 | ||
12 | Unless required by applicable law or agreed to in writing, software | |
13 | distributed under the License is distributed on an "AS IS" BASIS, | |
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
15 | See the License for the specific language governing permissions and | |
16 | limitations under the License. | |
17 | */ | |
18 | ||
19 | import java.io.IOException; | |
20 | import java.util.ArrayList; | |
21 | import java.util.List; | |
22 | ||
/**
 * A very simple CSV parser released under a commercial-friendly license.
 * This just implements splitting a single line into fields.
 *
 * <p>An instance keeps the unfinished part of a quoted field between calls to
 * {@link #parseLineMulti(String)} in a plain, unsynchronized field, so a single
 * instance must not be shared between threads without external locking.
 *
 * @author Glen Smith
 * @author Rainer Pruy
 */
public class CSVParser {

    /** The default separator to use if none is supplied to the constructor. */
    public static final char DEFAULT_SEPARATOR = ',';

    /** Initial capacity of the buffer in which each field is assembled. */
    public static final int INITIAL_READ_SIZE = 128;

    /**
     * The default quote character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_QUOTE_CHARACTER = '"';

    /**
     * The default escape character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_ESCAPE_CHARACTER = '\\';

    /**
     * The default strict quote behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_STRICT_QUOTES = false;

    private final char separator;

    private final char quotechar;

    private final char escape;

    private final boolean strictQuotes;

    /**
     * Content of a quoted field that was still open at the end of the last
     * multi-line call (including the re-appended newline), or {@code null}.
     */
    private String pending;

    /**
     * Constructs CSVParser using a comma for the separator.
     */
    public CSVParser() {
        this(DEFAULT_SEPARATOR, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator.
     *
     * @param separator
     *            the delimiter to use for separating entries.
     */
    public CSVParser(char separator) {
        this(separator, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator and quote char.
     *
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     */
    public CSVParser(char separator, char quotechar) {
        this(separator, quotechar, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     *
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     * @param escape
     *            the character to use for escaping a separator or quote
     */
    public CSVParser(char separator, char quotechar, char escape) {
        this(separator, quotechar, escape, DEFAULT_STRICT_QUOTES);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes" flag.
     *
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     * @param escape
     *            the character to use for escaping a separator or quote
     * @param strictQuotes
     *            if true, characters outside the quotes are ignored
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes) {
        this.separator = separator;
        this.quotechar = quotechar;
        this.escape = escape;
        this.strictQuotes = strictQuotes;
    }

    /**
     * Tells whether a quoted field is still open from a previous call.
     *
     * @return true if something was left over from last call(s)
     */
    public boolean isPending() {
        return pending != null;
    }

    /**
     * Parses one physical line in multi-line mode: a quoted field that is
     * still open at the end of the line is remembered (with a newline
     * appended) and prepended to the first field of the next call instead of
     * being reported as an error.
     *
     * @param nextLine
     *            the string to parse; {@code null} flushes any pending content
     * @return the fields completed on this line; the pending content as a
     *         single element if nextLine is null and something was pending;
     *         null if nextLine is null and nothing was pending
     * @throws IOException declared for symmetry with {@link #parseLine(String)}
     */
    public String[] parseLineMulti(String nextLine) throws IOException {
        return parseLine(nextLine, true);
    }

    /**
     * Parses one self-contained line. Any content left pending by an earlier
     * {@link #parseLineMulti(String)} call is discarded first.
     *
     * @param nextLine
     *            the string to parse
     * @return the tokenized list of elements, or null if nextLine is null
     * @throws IOException if a quoted field is not terminated on this line
     */
    public String[] parseLine(String nextLine) throws IOException {
        return parseLine(nextLine, false);
    }

    /**
     * Parses an incoming String and returns an array of elements.
     *
     * @param nextLine
     *            the string to parse
     * @param multi
     *            true to carry an unterminated quoted field over to the next
     *            call; false to discard pending content up front and treat an
     *            unterminated quoted field as an error
     * @return the tokenized list of elements; if nextLine is null, the pending
     *         content as a single element, or null when nothing is pending
     * @throws IOException if multi is false and a quoted field is still open
     *         at the end of the line
     */
    private String[] parseLine(String nextLine, boolean multi) throws IOException {

        if (!multi) {
            // single-line mode never continues an earlier line
            pending = null;
        }

        if (nextLine == null) {
            if (pending == null) {
                return null;
            }
            String leftover = pending;
            pending = null;
            return new String[] {leftover};
        }

        List<String> tokensOnThisLine = new ArrayList<String>();
        StringBuilder current = new StringBuilder(INITIAL_READ_SIZE);
        boolean inQuotes = false;
        if (pending != null) {
            // resume the quoted field that was open when the previous line ended
            current.append(pending);
            pending = null;
            inQuotes = true;
        }

        final int length = nextLine.length();
        for (int i = 0; i < length; i++) {
            char c = nextLine.charAt(i);
            if (c == this.escape) {
                // NOTE(review): an escape character that does not introduce an
                // escapable character is dropped, not copied - confirm intended.
                if (isNextCharacterEscapable(nextLine, inQuotes, i)) {
                    current.append(nextLine.charAt(++i));
                }
            } else if (c == quotechar) {
                if (isNextCharacterEscapedQuote(nextLine, inQuotes, i)) {
                    // doubled quote inside a quoted section: emit one quote
                    current.append(nextLine.charAt(++i));
                } else {
                    inQuotes = !inQuotes;
                    // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
                    if (!strictQuotes && isEmbeddedQuote(nextLine, i)) {
                        current.append(c);
                    }
                }
            } else if (c == separator && !inQuotes) {
                tokensOnThisLine.add(current.toString());
                current.setLength(0); // start work on next token, reusing the buffer
            } else if (!strictQuotes || inQuotes) {
                // in strict mode only characters inside quotes are kept
                current.append(c);
            }
        }

        // line is done - check status
        if (inQuotes) {
            if (!multi) {
                throw new IOException("Un-terminated quoted field at end of CSV line");
            }
            // continuing a quoted section, re-append newline; this partial
            // content is not to be added to the field list yet
            current.append("\n");
            pending = current.toString();
        } else {
            tokensOnThisLine.add(current.toString());
        }
        return tokensOnThisLine.toArray(new String[tokensOnThisLine.size()]);
    }

    /**
     * Decides whether a quote character at index i is copied into the field
     * as a literal, i.e. it sits in the middle of field content.
     *
     * precondition: the character at index i is a quote
     *
     * @param line the current line
     * @param i index of the quote in line
     * @return true if i is greater than 2, the previous character is not the
     *         separator, a next character exists and it is not the separator
     */
    private boolean isEmbeddedQuote(String line, int i) {
        // NOTE(review): the "i > 2" threshold is kept as found; quotes within
        // the first three characters of a line are therefore never embedded.
        return i > 2
                && line.charAt(i - 1) != this.separator
                && line.length() > (i + 1)
                && line.charAt(i + 1) != this.separator;
    }

    /**
     * precondition: the current character is a quote or an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i current index in line
     * @return true if in a quoted context and the following character is a quote
     */
    private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
        return inQuotes // we are in quotes, therefore there can be escaped quotes in here.
                && nextLine.length() > (i + 1) // there is indeed another character to check.
                && nextLine.charAt(i + 1) == quotechar;
    }

    /**
     * precondition: the current character is an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i current index in line
     * @return true if in a quoted context and the following character is
     *         either a quote or the escape character itself
     */
    protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
        if (!inQuotes || nextLine.length() <= (i + 1)) {
            return false; // nothing can be escaped outside quotes or at end of line
        }
        char next = nextLine.charAt(i + 1);
        return next == quotechar || next == this.escape;
    }
}