View Javadoc

1   package au.com.bytecode.opencsv;
2   
3   /**
4    Copyright 2005 Bytecode Pty Ltd.
5   
6    Licensed under the Apache License, Version 2.0 (the "License");
7    you may not use this file except in compliance with the License.
8    You may obtain a copy of the License at
9   
10   http://www.apache.org/licenses/LICENSE-2.0
11  
12   Unless required by applicable law or agreed to in writing, software
13   distributed under the License is distributed on an "AS IS" BASIS,
14   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   See the License for the specific language governing permissions and
16   limitations under the License.
17   */
18  
19  import java.io.IOException;
20  import java.util.ArrayList;
21  import java.util.List;
22  
/**
 * A very simple CSV parser released under a commercial-friendly license.
 * This just implements splitting a single line into fields.
 *
 * <p>An instance remembers the unfinished quoted field left over by
 * {@link #parseLineMulti(String)} in a mutable field and performs no
 * synchronization of its own.
 *
 * @author Glen Smith
 * @author Rainer Pruy
 */
public class CSVParser {

    /** Character that separates two fields. */
    private final char separator;

    /** Character that opens and closes a quoted section. */
    private final char quotechar;

    /** Character that, inside quotes, escapes a quote or another escape character. */
    private final char escape;

    /** When true, characters outside of quotes are not copied into the fields. */
    private final boolean strictQuotes;

    /**
     * Text of a quoted field that was still open at the end of the previous
     * multi-line call, or null when nothing is carried over.
     */
    private String pending;

    /** The default separator to use if none is supplied to the constructor. */
    public static final char DEFAULT_SEPARATOR = ',';

    /** Initial capacity of the buffer used to collect the characters of one field. */
    public static final int INITIAL_READ_SIZE = 128;

    /**
     * The default quote character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_QUOTE_CHARACTER = '"';

    /**
     * The default escape character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_ESCAPE_CHARACTER = '\\';

    /**
     * The default strict quote behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_STRICT_QUOTES = false;

    /**
     * Constructs CSVParser using a comma for the separator.
     */
    public CSVParser() {
        this(DEFAULT_SEPARATOR, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator.
     *
     * @param separator
     *            the delimiter to use for separating entries.
     */
    public CSVParser(char separator) {
        this(separator, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator and quote char.
     *
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     */
    public CSVParser(char separator, char quotechar) {
        this(separator, quotechar, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     *
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     * @param escape
     *            the character to use for escaping a separator or quote
     */
    public CSVParser(char separator, char quotechar, char escape) {
        this(separator, quotechar, escape, DEFAULT_STRICT_QUOTES);
    }

    /**
     * Constructs CSVParser with supplied separator, quote char and escape char.
     * Allows setting the "strict quotes" flag.
     *
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     * @param escape
     *            the character to use for escaping a separator or quote
     * @param strictQuotes
     *            if true, characters outside the quotes are ignored
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes) {
        this.separator = separator;
        this.quotechar = quotechar;
        this.escape = escape;
        this.strictQuotes = strictQuotes;
    }

    /**
     * Tells whether an unfinished quoted field is being carried over.
     *
     * @return true if something was left over from last call(s)
     */
    public boolean isPending() {
        return pending != null;
    }

    /**
     * Parses one line of a record that may span several lines. If the line
     * ends inside a quoted field, that partial field (with a trailing newline
     * appended) is kept as pending and prepended to the first field of the
     * next call instead of being returned.
     *
     * @param nextLine
     *            the string to parse; passing null returns any pending text
     *            as a single-element array and clears it
     * @return the fields completed on this line, or null if nextLine is null
     *         and nothing was pending
     * @throws IOException declared for symmetry with {@link #parseLine(String)}
     */
    public String[] parseLineMulti(String nextLine) throws IOException {
        return parseLine(nextLine, true);
    }

    /**
     * Parses a single, self-contained line. Any text left pending by an
     * earlier {@link #parseLineMulti(String)} call is discarded first.
     *
     * @param nextLine
     *            the string to parse
     * @return the tokenized list of elements, or null if nextLine is null
     * @throws IOException if the line ends inside a quoted field
     */
    public String[] parseLine(String nextLine) throws IOException {
        return parseLine(nextLine, false);
    }

    /**
     * Parses an incoming String and returns an array of elements.
     *
     * @param nextLine
     *            the string to parse
     * @param multi
     *            true to allow a quoted field to continue on the next call
     *            (the partial field is stored as pending); false to treat an
     *            unterminated quoted field as an error
     * @return the tokenized list of elements; if nextLine is null, the pending
     *         text as a single element, or null when nothing is pending
     * @throws IOException if multi is false and the line ends inside quotes
     */
    private String[] parseLine(String nextLine, boolean multi) throws IOException {

        // A single-line parse never continues a previous record.
        if (!multi) {
            pending = null;
        }

        if (nextLine == null) {
            if (pending == null) {
                return null;
            }
            // End of input: hand back whatever was collected so far.
            String leftover = pending;
            pending = null;
            return new String[] {leftover};
        }

        List<String> tokensOnThisLine = new ArrayList<String>();
        StringBuilder sb = new StringBuilder(INITIAL_READ_SIZE);
        boolean inQuotes = false;
        if (pending != null) {
            // Resume the quoted field that was left open by the previous line.
            sb.append(pending);
            pending = null;
            inQuotes = true;
        }

        for (int i = 0; i < nextLine.length(); i++) {
            char c = nextLine.charAt(i);
            if (c == this.escape) {
                // An escape that does not precede an escapable character is dropped.
                if (isNextCharacterEscapable(nextLine, inQuotes, i)) {
                    sb.append(nextLine.charAt(i + 1));
                    i++;
                }
            } else if (c == quotechar) {
                if (isNextCharacterEscapedQuote(nextLine, inQuotes, i)) {
                    // Doubled quote inside a quoted section stands for one literal quote.
                    sb.append(nextLine.charAt(i + 1));
                    i++;
                } else {
                    inQuotes = !inQuotes;
                    // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
                    if (!strictQuotes && isEmbeddedQuote(nextLine, i)) {
                        sb.append(c);
                    }
                }
            } else if (c == separator && !inQuotes) {
                tokensOnThisLine.add(sb.toString());
                sb = new StringBuilder(INITIAL_READ_SIZE); // start work on next token
            } else if (!strictQuotes || inQuotes) {
                sb.append(c);
            }
        }

        // line is done - check status
        if (inQuotes) {
            if (!multi) {
                throw new IOException("Un-terminated quoted field at end of CSV line");
            }
            // continuing a quoted section, re-append newline
            sb.append("\n");
            pending = sb.toString();
            sb = null; // this partial content is not to be added to field list yet
        }
        if (sb != null) {
            tokensOnThisLine.add(sb.toString());
        }
        return tokensOnThisLine.toArray(new String[tokensOnThisLine.size()]);
    }

    /**
     * Decides whether the quote at index i sits in the middle of a field and
     * must therefore be kept as literal text.
     *
     * @param nextLine the current line
     * @param i index of the quote character in the line
     * @return true if the quote is not the first character of the line, is
     *         followed by at least one more character, and neither neighbour
     *         is the separator
     */
    private boolean isEmbeddedQuote(String nextLine, int i) {
        return i > 0 // not on the beginning of the line
            && nextLine.charAt(i - 1) != this.separator // not at the beginning of a quoted field
            && nextLine.length() > (i + 1) // not the last character of the line
            && nextLine.charAt(i + 1) != this.separator; // not at the end of a quoted field
    }

    /**
     * precondition: the current character is a quote or an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i current index in line
     * @return true if the current context is quoted and the following
     *         character is a quote
     */
    private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
            && nextLine.length() > (i + 1)  // there is indeed another character to check.
            && nextLine.charAt(i + 1) == quotechar;
    }

    /**
     * precondition: the current character is an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i current index in line
     * @return true if the current context is quoted and the following
     *         character is a quote or an escape character
     */
    protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped characters in here.
            && nextLine.length() > (i + 1)  // there is indeed another character to check.
            && (nextLine.charAt(i + 1) == quotechar || nextLine.charAt(i + 1) == this.escape);
    }
}