001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019 020 package org.apache.geronimo.mail.util; 021 022 import java.io.BufferedInputStream; 023 import java.io.InputStream; 024 import java.io.IOException; 025 026 027 /** 028 * Set of utility classes for handling common encoding-related 029 * manipulations. 030 * 031 * @version $Rev: 467553 $ $Date: 2006-10-25 06:01:51 +0200 (Mi, 25. Okt 2006) $ 032 */ 033 public class ASCIIUtil { 034 private static final String MIME_FOLDTEXT = "mail.mime.foldtext"; 035 private static final int FOLD_THRESHOLD = 76; 036 037 /** 038 * Test to see if this string contains only US-ASCII (i.e., 7-bit 039 * ASCII) charactes. 040 * 041 * @param s The test string. 042 * 043 * @return true if this is a valid 7-bit ASCII encoding, false if it 044 * contains any non-US ASCII characters. 045 */ 046 static public boolean isAscii(String s) { 047 for (int i = 0; i < s.length(); i++) { 048 if (!isAscii(s.charAt(i))) { 049 return false; 050 } 051 } 052 return true; 053 } 054 055 /** 056 * Test to see if a given character can be considered "valid" ASCII. 057 * The excluded characters are the control characters less than 058 * 32, 8-bit characters greater than 127, EXCEPT the CR, LF and 059 * tab characters ARE considered value (all less than 32). 060 * 061 * @param ch The test character. 062 * 063 * @return true if this character meets the "ascii-ness" criteria, false 064 * otherwise. 065 */ 066 static public boolean isAscii(int ch) { 067 // these are explicitly considered valid. 068 if (ch == '\r' || ch == '\n' || ch == '\t') { 069 return true; 070 } 071 072 // anything else outside the range is just plain wrong. 073 if (ch >= 127 || ch < 32) { 074 return false; 075 } 076 return true; 077 } 078 079 080 /** 081 * Examine a stream of text and make a judgement on what encoding 082 * type should be used for the text. Ideally, we want to use 7bit 083 * encoding to determine this, but we may need to use either quoted-printable 084 * or base64. The choice is made on the ratio of 7-bit characters to non-7bit. 085 * 086 * @param content An input stream for the content we're examining. 087 * 088 * @exception IOException 089 */ 090 public static String getTextTransferEncoding(InputStream content) throws IOException { 091 092 // for efficiency, we'll read in blocks. 093 BufferedInputStream in = new BufferedInputStream(content, 4096); 094 095 int span = 0; // span of characters without a line break. 096 boolean containsLongLines = false; 097 int asciiChars = 0; 098 int nonAsciiChars = 0; 099 100 while (true) { 101 int ch = in.read(); 102 // if we hit an EOF here, go decide what type we've actually found. 103 if (ch == -1) { 104 break; 105 } 106 107 // we found a linebreak. Reset the line length counters on either one. We don't 108 // really need to validate here. 109 if (ch == '\n' || ch == '\r') { 110 // hit a line end, reset our line length counter 111 span = 0; 112 } 113 else { 114 span++; 115 // the text has long lines, we can't transfer this as unencoded text. 116 if (span > 998) { 117 containsLongLines = true; 118 } 119 120 // non-ascii character, we have to transfer this in binary. 121 if (!isAscii(ch)) { 122 nonAsciiChars++; 123 } 124 else { 125 asciiChars++; 126 } 127 } 128 } 129 130 // looking good so far, only valid chars here. 131 if (nonAsciiChars == 0) { 132 // does this contain long text lines? We need to use a Q-P encoding which will 133 // be only slightly longer, but handles folding the longer lines. 134 if (containsLongLines) { 135 return "quoted-printable"; 136 } 137 else { 138 // ideal! Easiest one to handle. 139 return "7bit"; 140 } 141 } 142 else { 143 // mostly characters requiring encoding? Base64 is our best bet. 144 if (nonAsciiChars > asciiChars) { 145 return "base64"; 146 } 147 else { 148 // Q-P encoding will use fewer bytes than the full Base64. 149 return "quoted-printable"; 150 } 151 } 152 } 153 154 155 /** 156 * Examine a stream of text and make a judgement on what encoding 157 * type should be used for the text. Ideally, we want to use 7bit 158 * encoding to determine this, but we may need to use either quoted-printable 159 * or base64. The choice is made on the ratio of 7-bit characters to non-7bit. 160 * 161 * @param content A string for the content we're examining. 162 */ 163 public static String getTextTransferEncoding(String content) { 164 165 int asciiChars = 0; 166 int nonAsciiChars = 0; 167 168 for (int i = 0; i < content.length(); i++) { 169 int ch = content.charAt(i); 170 171 // non-ascii character, we have to transfer this in binary. 172 if (!isAscii(ch)) { 173 nonAsciiChars++; 174 } 175 else { 176 asciiChars++; 177 } 178 } 179 180 // looking good so far, only valid chars here. 181 if (nonAsciiChars == 0) { 182 // ideal! Easiest one to handle. 183 return "7bit"; 184 } 185 else { 186 // mostly characters requiring encoding? Base64 is our best bet. 187 if (nonAsciiChars > asciiChars) { 188 return "base64"; 189 } 190 else { 191 // Q-P encoding will use fewer bytes than the full Base64. 192 return "quoted-printable"; 193 } 194 } 195 } 196 197 198 /** 199 * Determine if the transfer encoding looks like it might be 200 * valid ascii text, and thus transferable as 7bit code. In 201 * order for this to be true, all characters must be valid 202 * 7-bit ASCII code AND all line breaks must be properly formed 203 * (JUST '\r\n' sequences). 7-bit transfers also 204 * typically have a line limit of 1000 bytes (998 + the CRLF), so any 205 * stretch of charactes longer than that will also force Base64 encoding. 206 * 207 * @param content An input stream for the content we're examining. 208 * 209 * @exception IOException 210 */ 211 public static String getBinaryTransferEncoding(InputStream content) throws IOException { 212 213 // for efficiency, we'll read in blocks. 214 BufferedInputStream in = new BufferedInputStream(content, 4096); 215 216 int previousChar = 0; 217 int span = 0; // span of characters without a line break. 218 219 while (true) { 220 int ch = in.read(); 221 // if we hit an EOF here, we've only found valid text so far, so we can transfer this as 222 // 7-bit ascii. 223 if (ch == -1) { 224 return "7bit"; 225 } 226 227 // we found a newline, this is only valid if the previous char was the '\r' 228 if (ch == '\n') { 229 // malformed linebreak? force this to base64 encoding. 230 if (previousChar != '\r') { 231 return "base64"; 232 } 233 // hit a line end, reset our line length counter 234 span = 0; 235 } 236 else { 237 span++; 238 // the text has long lines, we can't transfer this as unencoded text. 239 if (span > 998) { 240 return "base64"; 241 } 242 243 // non-ascii character, we have to transfer this in binary. 244 if (!isAscii(ch)) { 245 return "base64"; 246 } 247 } 248 previousChar = ch; 249 } 250 } 251 252 253 /** 254 * Perform RFC 2047 text folding on a string of text. 255 * 256 * @param used The amount of text already "used up" on this line. This is 257 * typically the length of a message header that this text 258 * get getting added to. 259 * @param s The text to fold. 260 * 261 * @return The input text, with linebreaks inserted at appropriate fold points. 262 */ 263 public static String fold(int used, String s) { 264 // if folding is disable, unfolding is also. Return the string unchanged. 265 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) { 266 return s; 267 } 268 269 int end; 270 271 // now we need to strip off any trailing "whitespace", where whitespace is blanks, tabs, 272 // and line break characters. 273 for (end = s.length() - 1; end >= 0; end--) { 274 int ch = s.charAt(end); 275 if (ch != ' ' && ch != '\t' ) { 276 break; 277 } 278 } 279 280 // did we actually find something to remove? Shorten the String to the trimmed length 281 if (end != s.length() - 1) { 282 s = s.substring(0, end + 1); 283 } 284 285 // does the string as it exists now not require folding? We can just had that back right off. 286 if (s.length() + used <= FOLD_THRESHOLD) { 287 return s; 288 } 289 290 // get a buffer for the length of the string, plus room for a few line breaks. 291 // these are soft line breaks, so we generally need more that just the line breaks (an escape + 292 // CR + LF + leading space on next line); 293 StringBuffer newString = new StringBuffer(s.length() + 8); 294 295 296 // now keep chopping this down until we've accomplished what we need. 297 while (used + s.length() > FOLD_THRESHOLD) { 298 int breakPoint = -1; 299 char breakChar = 0; 300 301 // now scan for the next place where we can break. 302 for (int i = 0; i < s.length(); i++) { 303 // have we passed the fold limit? 304 if (used + i > FOLD_THRESHOLD) { 305 // if we've already seen a blank, then stop now. Otherwise 306 // we keep going until we hit a fold point. 307 if (breakPoint != -1) { 308 break; 309 } 310 } 311 char ch = s.charAt(i); 312 313 // a white space character? 314 if (ch == ' ' || ch == '\t') { 315 // this might be a run of white space, so skip over those now. 316 breakPoint = i; 317 // we need to maintain the same character type after the inserted linebreak. 318 breakChar = ch; 319 i++; 320 while (i < s.length()) { 321 ch = s.charAt(i); 322 if (ch != ' ' && ch != '\t') { 323 break; 324 } 325 i++; 326 } 327 } 328 // found an embedded new line. Escape this so that the unfolding process preserves it. 329 else if (ch == '\n') { 330 newString.append('\\'); 331 newString.append('\n'); 332 } 333 else if (ch == '\r') { 334 newString.append('\\'); 335 newString.append('\n'); 336 i++; 337 // if this is a CRLF pair, add the second char also 338 if (i < s.length() && s.charAt(i) == '\n') { 339 newString.append('\r'); 340 } 341 } 342 343 } 344 // no fold point found, we punt, append the remainder and leave. 345 if (breakPoint == -1) { 346 newString.append(s); 347 return newString.toString(); 348 } 349 newString.append(s.substring(0, breakPoint)); 350 newString.append("\r\n"); 351 newString.append(breakChar); 352 // chop the string 353 s = s.substring(breakPoint + 1); 354 // start again, and we've used the first char of the limit already with the whitespace char. 355 used = 1; 356 } 357 358 // add on the remainder, and return 359 newString.append(s); 360 return newString.toString(); 361 } 362 363 /** 364 * Unfold a folded string. The unfolding process will remove 365 * any line breaks that are not escaped and which are also followed 366 * by whitespace characters. 367 * 368 * @param s The folded string. 369 * 370 * @return A new string with unfolding rules applied. 371 */ 372 public static String unfold(String s) { 373 // if folding is disable, unfolding is also. Return the string unchanged. 374 if (!SessionUtil.getBooleanProperty(MIME_FOLDTEXT, true)) { 375 return s; 376 } 377 378 // if there are no line break characters in the string, we can just return this. 379 if (s.indexOf('\n') < 0 && s.indexOf('\r') < 0) { 380 return s; 381 } 382 383 // we need to scan and fix things up. 384 int length = s.length(); 385 386 StringBuffer newString = new StringBuffer(length); 387 388 // scan the entire string 389 for (int i = 0; i < length; i++) { 390 int ch = s.charAt(i); 391 392 // we have a backslash. In folded strings, escape characters are only processed as such if 393 // they preceed line breaks. Otherwise, we leave it be. 394 if (ch == '\\') { 395 // escape at the very end? Just add the character. 396 if (i == length - 1) { 397 newString.append(ch); 398 } 399 else { 400 int nextChar = s.charAt(i + 1); 401 402 // naked newline? Add the new line to the buffer, and skip the escape char. 403 if (nextChar == '\n') { 404 newString.append('\n'); 405 i++; 406 } 407 else if (nextChar == '\r') { 408 // just the CR left? Add it, removing the escape. 409 if (i == length - 2 || s.charAt(i + 2) != '\r') { 410 newString.append('\r'); 411 i++; 412 } 413 else { 414 // toss the escape, add both parts of the CRLF, and skip over two chars. 415 newString.append('\r'); 416 newString.append('\n'); 417 i += 2; 418 } 419 } 420 else { 421 // an escape for another purpose, just copy it over. 422 newString.append(ch); 423 } 424 } 425 } 426 // we have an unescaped line break 427 else if (ch == '\n' || ch == '\r') { 428 // remember the position in case we need to backtrack. 429 int lineBreak = i; 430 boolean CRLF = false; 431 432 if (ch == '\r') { 433 // check to see if we need to step over this. 434 if (i < length - 1 && s.charAt(i + 1) == '\n') { 435 i++; 436 // flag the type so we know what we might need to preserve. 437 CRLF = true; 438 } 439 } 440 441 // get a temp position scanner. 442 int scan = i + 1; 443 444 // does a blank follow this new line? we need to scrap the new line and reduce the leading blanks 445 // down to a single blank. 446 if (scan < length && s.charAt(scan) == ' ') { 447 // add the character 448 newString.append(' '); 449 450 // scan over the rest of the blanks 451 i = scan + 1; 452 while (i < length && s.charAt(i) == ' ') { 453 i++; 454 } 455 // we'll increment down below, so back up to the last blank as the current char. 456 i--; 457 } 458 else { 459 // we must keep this line break. Append the appropriate style. 460 if (CRLF) { 461 newString.append("\r\n"); 462 } 463 else { 464 newString.append(ch); 465 } 466 } 467 } 468 else { 469 // just a normal, ordinary character 470 newString.append(ch); 471 } 472 } 473 return newString.toString(); 474 } 475 }