/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.validator;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.validator.routines.InetAddressValidator;
import org.apache.commons.validator.util.Flags;
import org.apache.oro.text.perl.Perl5Util;

/**
 * <p>Validates URLs.</p>
 * Behaviour of validation is modified by passing in options:
 * <ul>
 * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
 * component.</li>
 * <li>NO_FRAGMENTS - [FALSE]  By default fragments are allowed, if this option is
 * included then fragments are flagged as illegal.</li>
 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
 * considered valid schemes.  Enabling this option will let any validly
 * formatted scheme pass validation.</li>
 * </ul>
 *
 * <p>Originally based on a php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
 * http://javascript.internet.com. However, this validation now bears little resemblance
 * to the php original.</p>
 * <pre>
 *   Example of usage:
 *   Construct a UrlValidator with valid schemes of "http", and "https".
 *
 *    String[] schemes = {"http","https"};
 *    UrlValidator urlValidator = new UrlValidator(schemes);
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("url is valid");
 *    } else {
 *       System.out.println("url is invalid");
 *    }
 *
 *    prints "url is invalid"
 *   If instead the default constructor is used.
 *
 *    UrlValidator urlValidator = new UrlValidator();
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("url is valid");
 *    } else {
 *       System.out.println("url is invalid");
 *    }
 *
 *   prints out "url is valid"
 *  </pre>
 *
 * @see
 * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
 *  Uniform Resource Identifiers (URI): Generic Syntax
 * </a>
 *
 * @version $Revision: 588054 $ $Date: 2007-10-25 00:12:12 +0200 (Do, 25. Okt 2007) $
 * @since Validator 1.1
 * @deprecated Use the new UrlValidator in the routines package. This class
 * will be removed in a future release.
 */
public class UrlValidator implements Serializable {

    /**
     * Allows all validly formatted schemes to pass validation instead of
     * supplying a set of valid schemes.
     */
    public static final int ALLOW_ALL_SCHEMES = 1 << 0;

    /**
     * Allow two slashes in the path component of the URL.
     */
    public static final int ALLOW_2_SLASHES = 1 << 1;

    /**
     * Enabling this option disallows any URL fragments.
     */
    public static final int NO_FRAGMENTS = 1 << 2;

    private static final String ALPHA_CHARS = "a-zA-Z";

    private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";

    private static final String SPECIAL_CHARS = ";/@&=,.?:+$";

    private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";

    /**
     * Characters permitted as the first character of a scheme.
     */
    private static final String SCHEME_CHARS = ALPHA_CHARS;

    /**
     * Characters permitted in the host part of the authority.
     */
    private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";

    private static final String ATOM = VALID_CHARS + '+';

    /**
     * This expression derived/taken from the BNF for URI (RFC2396).
     */
    private static final String URL_PATTERN =
            "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?/";
    //         12            3  4          5       6   7        8 9

    /**
     * Schema/Protocol (ie. http:, ftp:, file:, etc).
     */
    private static final int PARSE_URL_SCHEME = 2;

    /**
     * Includes hostname/ip and port number.
     */
    private static final int PARSE_URL_AUTHORITY = 4;

    private static final int PARSE_URL_PATH = 5;

    private static final int PARSE_URL_QUERY = 7;

    private static final int PARSE_URL_FRAGMENT = 9;

    /**
     * Protocol (ie. http:, ftp:,https:). Only anchors the start, so it
     * checks nothing but the first character of the scheme.
     */
    private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS + "]/";

    /**
     * Matches any character that RFC 2396 does not permit in a scheme
     * (scheme = alpha *( alpha | digit | "+" | "-" | "." )). Deliberately
     * unanchored: a match anywhere in the scheme means the scheme is illegal.
     */
    private static final String SCHEME_ILLEGAL_CHAR_PATTERN =
            "/[^" + ALPHA_NUMERIC_CHARS + "+.\\-]/";

    private static final String AUTHORITY_PATTERN =
            "/^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?/";
    //         1                          2       3

    private static final int PARSE_AUTHORITY_HOST_IP = 1;

    private static final int PARSE_AUTHORITY_PORT = 2;

    /**
     * Should always be empty.
     */
    private static final int PARSE_AUTHORITY_EXTRA = 3;

    private static final String PATH_PATTERN = "/^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$/";

    private static final String QUERY_PATTERN = "/^(.*)$/";

    private static final String LEGAL_ASCII_PATTERN = "/^[\\000-\\177]+$/";

    private static final String DOMAIN_PATTERN =
            "/^" + ATOM + "(\\." + ATOM + ")*$/";

    private static final String PORT_PATTERN = "/^:(\\d{1,5})$/";

    /**
     * Largest value a 16-bit port number can take.
     */
    private static final int MAX_PORT = 65535;

    private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS + "]/";

    /**
     * Holds the set of current validation options.
     */
    private Flags options = null;

    /**
     * The set of schemes that are allowed to be in a URL.
     */
    private Set allowedSchemes = new HashSet();

    /**
     * If no schemes are provided, default to this set.
     */
    protected String[] defaultSchemes = {"http", "https", "ftp"};

    /**
     * Create a UrlValidator with default properties.
     */
    public UrlValidator() {
        this(null);
    }

    /**
     * Behavior of validation is modified by passing in several strings options:
     * @param schemes Pass in one or more url schemes to consider valid, passing in
     *        a null will default to "http,https,ftp" being valid.
     *        If a non-null schemes is specified then all valid schemes must
     *        be specified. Setting the ALLOW_ALL_SCHEMES option will
     *        ignore the contents of schemes.
     */
    public UrlValidator(String[] schemes) {
        this(schemes, 0);
    }

    /**
     * Initialize a UrlValidator with the given validation options.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(int options) {
        this(null, options);
    }

    /**
     * Behaviour of validation is modified by passing in options:
     * @param schemes The set of valid schemes. A <code>null</code> value selects
     * the default schemes; ignored when ALLOW_ALL_SCHEMES is set.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(String[] schemes, int options) {
        this.options = new Flags(options);

        // With ALLOW_ALL_SCHEMES the allowed set is never consulted, so leave it empty.
        if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
            return;
        }

        if (schemes == null) {
            schemes = this.defaultSchemes;
        }

        this.allowedSchemes.addAll(Arrays.asList(schemes));
    }

    /**
     * <p>Checks if a field has a valid url address.</p>
     *
     * <p>The value must consist of 7-bit ASCII characters only; it is split into
     * scheme, authority, path, query and fragment, and each component must pass
     * its own <code>isValidXxx</code> check.</p>
     *
     * @param value The value validation is being performed on.  A <code>null</code>
     * value is considered invalid.
     * @return true if the url is valid.
     */
    public boolean isValid(String value) {
        if (value == null) {
            return false;
        }

        Perl5Util matchUrlPat = new Perl5Util();
        Perl5Util matchAsciiPat = new Perl5Util();

        if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) {
            return false;
        }

        // Check the whole url address structure
        if (!matchUrlPat.match(URL_PATTERN, value)) {
            return false;
        }

        if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) {
            return false;
        }

        if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) {
            return false;
        }

        if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) {
            return false;
        }

        if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) {
            return false;
        }

        if (!isValidFragment(matchUrlPat.group(PARSE_URL_FRAGMENT))) {
            return false;
        }

        return true;
    }

    /**
     * Validate scheme. If schemes[] was initialized to a non null,
     * then only those scheme's are allowed.  Note this is slightly different
     * than for the constructor.
     *
     * <p>The scheme must start with a letter and may otherwise contain only
     * letters, digits, '+', '-' and '.' (RFC 2396, section 3.1). Unless
     * ALLOW_ALL_SCHEMES is set it must also be one of the allowed schemes;
     * that lookup is case sensitive.</p>
     *
     * @param scheme The scheme to validate.  A <code>null</code> value is considered
     * invalid.
     * @return true if valid.
     */
    protected boolean isValidScheme(String scheme) {
        if (scheme == null) {
            return false;
        }

        Perl5Util schemeMatcher = new Perl5Util();

        // First character must be a letter.
        if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) {
            return false;
        }

        // SCHEME_PATTERN only looks at the first character; reject schemes
        // such as "h ttp" or "a@b" that carry illegal characters later on.
        if (schemeMatcher.match(SCHEME_ILLEGAL_CHAR_PATTERN, scheme)) {
            return false;
        }

        if (this.options.isOff(ALLOW_ALL_SCHEMES)) {
            if (!this.allowedSchemes.contains(scheme)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns true if the authority is properly formatted.  An authority is the combination
     * of hostname and port.  A <code>null</code> authority value is considered invalid.
     *
     * <p>The host must either be accepted by {@link InetAddressValidator} or be a
     * dotted host name with at least two labels whose last label is 2 to 4
     * characters long and starts with a letter. A port, when present, must be
     * 1 to 5 digits and not exceed 65535.</p>
     *
     * @param authority Authority value to validate.
     * @return true if authority (hostname and port) is valid.
     */
    protected boolean isValidAuthority(String authority) {
        if (authority == null) {
            return false;
        }

        Perl5Util authorityMatcher = new Perl5Util();
        InetAddressValidator inetAddressValidator =
                InetAddressValidator.getInstance();

        if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) {
            return false;
        }

        boolean hostname = false;
        // check if authority is IP address or hostname
        String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
        boolean ipV4Address = inetAddressValidator.isValid(hostIP);

        if (!ipV4Address) {
            // Domain is hostname name
            Perl5Util domainMatcher = new Perl5Util();
            hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP);
        }

        if (hostname) {
            // DOMAIN_PATTERN guarantees non-empty labels separated by single
            // dots, so the top level label is simply the text after the last dot.
            int lastDot = hostIP.lastIndexOf('.');

            // Make sure there's a host name preceding the top level label.
            if (lastDot < 0) {
                return false;
            }

            String topLevel = hostIP.substring(lastDot + 1);

            // NOTE(review): the upper bound rejects longer top level labels
            // such as "museum"; kept as is to preserve existing behaviour.
            if (topLevel.length() < 2 || topLevel.length() > 4) {
                return false;
            }

            // First letter of top level must be an alpha; this also keeps
            // all-numeric dotted values from being treated as host names.
            Perl5Util alphaMatcher = new Perl5Util();
            if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(0, 1))) {
                return false;
            }
        }

        if (!hostname && !ipV4Address) {
            return false;
        }

        String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
        if (port != null) {
            Perl5Util portMatcher = new Perl5Util();
            if (!portMatcher.match(PORT_PATTERN, port)) {
                return false;
            }

            // Five digits can still exceed the 16-bit port range (e.g. 99999).
            try {
                if (Integer.parseInt(portMatcher.group(1)) > MAX_PORT) {
                    return false;
                }
            } catch (NumberFormatException e) {
                return false;
            }
        }

        // Anything left over after host and port makes the authority invalid.
        String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
        if (!GenericValidator.isBlankOrNull(extra)) {
            return false;
        }

        return true;
    }

    /**
     * Returns true if the path is valid.  A <code>null</code> value is considered invalid.
     *
     * <p>Besides matching the allowed path characters, a path containing "//" is
     * rejected unless ALLOW_2_SLASHES is set, and a path is rejected when it has
     * too many ".." tokens relative to its number of slashes.</p>
     *
     * @param path Path value to validate.
     * @return true if path is valid.
     */
    protected boolean isValidPath(String path) {
        if (path == null) {
            return false;
        }

        Perl5Util pathMatcher = new Perl5Util();

        if (!pathMatcher.match(PATH_PATTERN, path)) {
            return false;
        }

        int slash2Count = countToken("//", path);
        if (this.options.isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
            return false;
        }

        int slashCount = countToken("/", path);
        int dot2Count = countToken("..", path);
        if (dot2Count > 0) {
            if ((slashCount - slash2Count - 1) <= dot2Count) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns true if the query is null or it's a properly formatted query string.
     * @param query Query value to validate.
     * @return true if query is valid.
     */
    protected boolean isValidQuery(String query) {
        if (query == null) {
            return true;
        }

        Perl5Util queryMatcher = new Perl5Util();
        return queryMatcher.match(QUERY_PATTERN, query);
    }

    /**
     * Returns true if the given fragment is null or fragments are allowed.
     * @param fragment Fragment value to validate.
     * @return true if fragment is valid.
     */
    protected boolean isValidFragment(String fragment) {
        if (fragment == null) {
            return true;
        }

        return this.options.isOff(NO_FRAGMENTS);
    }

    /**
     * Returns the number of times the token appears in the target.
     * Overlapping occurrences are counted, so "..." contains ".." twice.
     * An empty token is reported as occurring zero times.
     * @param token Token value to be counted.
     * @param target Target value to count tokens in.
     * @return the number of tokens.
     */
    protected int countToken(String token, String target) {
        // indexOf("") never returns -1, so an empty token would loop forever.
        if (token.length() == 0) {
            return 0;
        }

        int count = 0;
        int tokenIndex = target.indexOf(token);
        while (tokenIndex != -1) {
            count++;
            // Advance by one character only so overlapping matches are counted.
            tokenIndex = target.indexOf(token, tokenIndex + 1);
        }
        return count;
    }
}