001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *  http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing,
013     * software distributed under the License is distributed on an
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015     * KIND, either express or implied.  See the License for the
016     * specific language governing permissions and limitations
017     * under the License.
018     */
019    
020    package org.apache.geronimo.mail.util;
021    
022    import java.io.BufferedInputStream;
023    import java.io.InputStream;
024    import java.io.IOException;
025    
026    
027    /**
028     * Set of utility classes for handling common encoding-related
029     * manipulations.
030     *
031     * @version $Rev: 467553 $ $Date: 2006-10-25 06:01:51 +0200 (Mi, 25. Okt 2006) $
032     */
033    public class ASCIIUtil {
034        private static final String MIME_FOLDTEXT = "mail.mime.foldtext";
035        private static final int FOLD_THRESHOLD = 76;
036    
037        /**
038         * Test to see if this string contains only US-ASCII (i.e., 7-bit
039         * ASCII) charactes.
040         *
041         * @param s      The test string.
042         *
043         * @return true if this is a valid 7-bit ASCII encoding, false if it
044         *         contains any non-US ASCII characters.
045         */
046        static public boolean isAscii(String s) {
047            for (int i = 0; i < s.length(); i++) {
048                if (!isAscii(s.charAt(i))) {
049                    return false;
050                }
051            }
052            return true;
053        }
054    
055        /**
056         * Test to see if a given character can be considered "valid" ASCII.
057         * The excluded characters are the control characters less than
058         * 32, 8-bit characters greater than 127, EXCEPT the CR, LF and
059         * tab characters ARE considered value (all less than 32).
060         *
061         * @param ch     The test character.
062         *
063         * @return true if this character meets the "ascii-ness" criteria, false
064         *         otherwise.
065         */
066        static public boolean isAscii(int ch) {
067            // these are explicitly considered valid.
068            if (ch == '\r' || ch == '\n' || ch == '\t') {
069                return true;
070            }
071    
072            // anything else outside the range is just plain wrong.
073            if (ch >= 127 || ch < 32) {
074                return false;
075            }
076            return true;
077        }
078    
079    
080        /**
081         * Examine a stream of text and make a judgement on what encoding
082         * type should be used for the text.  Ideally, we want to use 7bit
083         * encoding to determine this, but we may need to use either quoted-printable
084         * or base64.  The choice is made on the ratio of 7-bit characters to non-7bit.
085         *
086         * @param content     An input stream for the content we're examining.
087         *
088         * @exception IOException
089         */
090        public static String getTextTransferEncoding(InputStream content) throws IOException {
091    
092            // for efficiency, we'll read in blocks.
093            BufferedInputStream in = new BufferedInputStream(content, 4096);
094    
095            int span = 0;            // span of characters without a line break.
096            boolean containsLongLines = false;
097            int asciiChars = 0;
098            int nonAsciiChars = 0;
099    
100            while (true) {
101                int ch = in.read();
102                // if we hit an EOF here, go decide what type we've actually found.
103                if (ch == -1) {
104                    break;
105                }
106    
107                // we found a linebreak.  Reset the line length counters on either one.  We don't
108                // really need to validate here.
109                if (ch == '\n' || ch == '\r') {
110                    // hit a line end, reset our line length counter
111                    span = 0;
112                }
113                else {
114                    span++;
115                    // the text has long lines, we can't transfer this as unencoded text.
116                    if (span > 998) {
117                        containsLongLines = true;
118                    }
119    
120                    // non-ascii character, we have to transfer this in binary.
121                    if (!isAscii(ch)) {
122                        nonAsciiChars++;
123                    }
124                    else {
125                        asciiChars++;
126                    }
127                }
128            }
129    
130            // looking good so far, only valid chars here.
131            if (nonAsciiChars == 0) {
132                // does this contain long text lines?  We need to use a Q-P encoding which will
133                // be only slightly longer, but handles folding the longer lines.
134                if (containsLongLines) {
135                    return "quoted-printable";
136                }
137                else {
138                    // ideal!  Easiest one to handle.
139                    return "7bit";
140                }
141            }
142            else {
143                // mostly characters requiring encoding?  Base64 is our best bet.
144                if (nonAsciiChars > asciiChars) {
145                    return "base64";
146                }
147                else {
148                    // Q-P encoding will use fewer bytes than the full Base64.
149                    return "quoted-printable";
150                }
151            }
152        }
153    
154    
155        /**
156         * Examine a stream of text and make a judgement on what encoding
157         * type should be used for the text.  Ideally, we want to use 7bit
158         * encoding to determine this, but we may need to use either quoted-printable
159         * or base64.  The choice is made on the ratio of 7-bit characters to non-7bit.
160         *
161         * @param content     A string for the content we're examining.
162         */
163        public static String getTextTransferEncoding(String content) {
164    
165            int asciiChars = 0;
166            int nonAsciiChars = 0;
167    
168            for (int i = 0; i < content.length(); i++) {
169                int ch = content.charAt(i);
170    
171                // non-ascii character, we have to transfer this in binary.
172                if (!isAscii(ch)) {
173                    nonAsciiChars++;
174                }
175                else {
176                    asciiChars++;
177                }
178            }
179    
180            // looking good so far, only valid chars here.
181            if (nonAsciiChars == 0) {
182                // ideal!  Easiest one to handle.
183                return "7bit";
184            }
185            else {
186                // mostly characters requiring encoding?  Base64 is our best bet.
187                if (nonAsciiChars > asciiChars) {
188                    return "base64";
189                }
190                else {
191                    // Q-P encoding will use fewer bytes than the full Base64.
192                    return "quoted-printable";
193                }
194            }
195        }
196    
197    
198        /**
199         * Determine if the transfer encoding looks like it might be
200         * valid ascii text, and thus transferable as 7bit code.  In
201         * order for this to be true, all characters must be valid
202         * 7-bit ASCII code AND all line breaks must be properly formed
203         * (JUST '\r\n' sequences).  7-bit transfers also
204         * typically have a line limit of 1000 bytes (998 + the CRLF), so any
205         * stretch of charactes longer than that will also force Base64 encoding.
206         *
207         * @param content     An input stream for the content we're examining.
208         *
209         * @exception IOException
210         */
211        public static String getBinaryTransferEncoding(InputStream content) throws IOException {
212    
213            // for efficiency, we'll read in blocks.
214            BufferedInputStream in = new BufferedInputStream(content, 4096);
215    
216            int previousChar = 0;
217            int span = 0;            // span of characters without a line break.
218    
219            while (true) {
220                int ch = in.read();
221                // if we hit an EOF here, we've only found valid text so far, so we can transfer this as
222                // 7-bit ascii.
223                if (ch == -1) {
224                    return "7bit";
225                }
226    
227                // we found a newline, this is only valid if the previous char was the '\r'
228                if (ch == '\n') {
229                    // malformed linebreak?  force this to base64 encoding.
230                    if (previousChar != '\r') {
231                        return "base64";
232                    }
233                    // hit a line end, reset our line length counter
234                    span = 0;
235                }
236                else {
237                    span++;
238                    // the text has long lines, we can't transfer this as unencoded text.
239                    if (span > 998) {
240                        return "base64";
241                    }
242    
243                    // non-ascii character, we have to transfer this in binary.
244                    if (!isAscii(ch)) {
245                        return "base64";
246                    }
247                }
248                previousChar = ch;
249            }
250        }
251    
252    
253        /**
254         * Perform RFC 2047 text folding on a string of text.
255         *
256         * @param used   The amount of text already "used up" on this line.  This is
257         *               typically the length of a message header that this text
258         *               get getting added to.
259         * @param s      The text to fold.
260         *
261         * @return The input text, with linebreaks inserted at appropriate fold points.
262         */
263        public static String fold(int used, String s) {
264            // if folding is disable, unfolding is also.  Return the string unchanged.
265            if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
266                return s;
267            }
268    
269            int end;
270    
271            // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs,
272            // and line break characters.
273            for (end = s.length() - 1; end >= 0; end--) {
274                int ch = s.charAt(end);
275                if (ch != ' ' && ch != '\t' ) {
276                    break;
277                }
278            }
279    
280            // did we actually find something to remove?  Shorten the String to the trimmed length
281            if (end != s.length() - 1) {
282                s = s.substring(0, end + 1);
283            }
284    
285            // does the string as it exists now not require folding?  We can just had that back right off.
286            if (s.length() + used <= FOLD_THRESHOLD) {
287                return s;
288            }
289    
290            // get a buffer for the length of the string, plus room for a few line breaks.
291            // these are soft line breaks, so we generally need more that just the line breaks (an escape +
292            // CR + LF + leading space on next line);
293            StringBuffer newString = new StringBuffer(s.length() + 8);
294    
295    
296            // now keep chopping this down until we've accomplished what we need.
297            while (used + s.length() > FOLD_THRESHOLD) {
298                int breakPoint = -1;
299                char breakChar = 0;
300    
301                // now scan for the next place where we can break.
302                for (int i = 0; i < s.length(); i++) {
303                    // have we passed the fold limit?
304                    if (used + i > FOLD_THRESHOLD) {
305                        // if we've already seen a blank, then stop now.  Otherwise
306                        // we keep going until we hit a fold point.
307                        if (breakPoint != -1) {
308                            break;
309                        }
310                    }
311                    char ch = s.charAt(i);
312    
313                    // a white space character?
314                    if (ch == ' ' || ch == '\t') {
315                        // this might be a run of white space, so skip over those now.
316                        breakPoint = i;
317                        // we need to maintain the same character type after the inserted linebreak.
318                        breakChar = ch;
319                        i++;
320                        while (i < s.length()) {
321                            ch = s.charAt(i);
322                            if (ch != ' ' && ch != '\t') {
323                                break;
324                            }
325                            i++;
326                        }
327                    }
328                    // found an embedded new line.  Escape this so that the unfolding process preserves it.
329                    else if (ch == '\n') {
330                        newString.append('\\');
331                        newString.append('\n');
332                    }
333                    else if (ch == '\r') {
334                        newString.append('\\');
335                        newString.append('\n');
336                        i++;
337                        // if this is a CRLF pair, add the second char also
338                        if (i < s.length() && s.charAt(i) == '\n') {
339                            newString.append('\r');
340                        }
341                    }
342    
343                }
344                // no fold point found, we punt, append the remainder and leave.
345                if (breakPoint == -1) {
346                    newString.append(s);
347                    return newString.toString();
348                }
349                newString.append(s.substring(0, breakPoint));
350                newString.append("\r\n");
351                newString.append(breakChar);
352                // chop the string
353                s = s.substring(breakPoint + 1);
354                // start again, and we've used the first char of the limit already with the whitespace char.
355                used = 1;
356            }
357    
358            // add on the remainder, and return
359            newString.append(s);
360            return newString.toString();
361        }
362    
363        /**
364         * Unfold a folded string.  The unfolding process will remove
365         * any line breaks that are not escaped and which are also followed
366         * by whitespace characters.
367         *
368         * @param s      The folded string.
369         *
370         * @return A new string with unfolding rules applied.
371         */
372        public static String unfold(String s) {
373            // if folding is disable, unfolding is also.  Return the string unchanged.
374            if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) {
375                return s;
376            }
377    
378            // if there are no line break characters in the string, we can just return this.
379            if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) {
380                return s;
381            }
382    
383            // we need to scan and fix things up.
384            int length = s.length();
385    
386            StringBuffer newString = new StringBuffer(length);
387    
388            // scan the entire string
389            for (int i = 0; i < length; i++) {
390                int ch = s.charAt(i);
391    
392                // we have a backslash.  In folded strings, escape characters are only processed as such if
393                // they preceed line breaks.  Otherwise, we leave it be.
394                if (ch == '\\') {
395                    // escape at the very end?  Just add the character.
396                    if (i == length - 1) {
397                        newString.append(ch);
398                    }
399                    else {
400                        int nextChar = s.charAt(i + 1);
401    
402                        // naked newline?  Add the new line to the buffer, and skip the escape char.
403                        if (nextChar == '\n') {
404                            newString.append('\n');
405                            i++;
406                        }
407                        else if (nextChar == '\r') {
408                            // just the CR left?  Add it, removing the escape.
409                            if (i == length - 2 || s.charAt(i + 2) != '\r') {
410                                newString.append('\r');
411                                i++;
412                            }
413                            else {
414                                // toss the escape, add both parts of the CRLF, and skip over two chars.
415                                newString.append('\r');
416                                newString.append('\n');
417                                i += 2;
418                            }
419                        }
420                        else {
421                            // an escape for another purpose, just copy it over.
422                            newString.append(ch);
423                        }
424                    }
425                }
426                // we have an unescaped line break
427                else if (ch == '\n' || ch == '\r') {
428                    // remember the position in case we need to backtrack.
429                    int lineBreak = i;
430                    boolean CRLF = false;
431    
432                    if (ch == '\r') {
433                        // check to see if we need to step over this.
434                        if (i < length - 1 && s.charAt(i + 1) == '\n') {
435                            i++;
436                            // flag the type so we know what we might need to preserve.
437                            CRLF = true;
438                        }
439                    }
440    
441                    // get a temp position scanner.
442                    int scan = i + 1;
443    
444                    // does a blank follow this new line?  we need to scrap the new line and reduce the leading blanks
445                    // down to a single blank.
446                    if (scan < length && s.charAt(scan) == ' ') {
447                        // add the character
448                        newString.append(' ');
449    
450                        // scan over the rest of the blanks
451                        i = scan + 1;
452                        while (i < length && s.charAt(i) == ' ') {
453                            i++;
454                        }
455                        // we'll increment down below, so back up to the last blank as the current char.
456                        i--;
457                    }
458                    else {
459                        // we must keep this line break.  Append the appropriate style.
460                        if (CRLF) {
461                            newString.append("\r\n");
462                        }
463                        else {
464                            newString.append(ch);
465                        }
466                    }
467                }
468                else {
469                    // just a normal, ordinary character
470                    newString.append(ch);
471                }
472            }
473            return newString.toString();
474        }
475    }