/*
 * @(#)URI.java                                         0.3-3 06/05/2001
 *
 *  This file is part of the HTTPClient package
 *  Copyright (C) 1996-2001 Ronald Tschalär
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free
 *  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *  MA 02111-1307, USA
 *
 *  For questions, suggestions, bug-reports, enhancement-requests etc.
 *  I may be contacted at:
 *
 *  ronald@innovation.ch
 *
 *  The HTTPClient's home page is located at:
 *
 *  http://www.innovation.ch/java/HTTPClient/
 *
 */

package HTTPClient;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.BitSet;
import java.util.Hashtable;

/**
 * This class represents a generic URI, as defined in RFC-2396.
 * This is similar to java.net.URL, with the following enhancements:
 *
 * <ul>
 * <li>The elements are always stored in escaped form.
 *
 * <li>While RFC-2396 distinguishes between just two forms of URIs, those
 *     that follow the generic syntax and those that don't, this class knows
 *     about a third form, named semi-generic, used by quite a few popular
 *     schemes. Semi-generic syntax treats the path part as opaque, i.e. has
 *     the form <code>&lt;scheme&gt;://&lt;authority&gt;/&lt;opaque&gt;</code>.
 *     Relative URIs of this type are only resolved as far as absolute
 *     paths - relative paths do not exist.
 * </ul>
 *
 * <p>Ideally, java.net.URL should subclass URI.
 *
 * <p>NOTE(review): this copy of the file appears to have been damaged by a
 * lossy text extraction - several stretches of code that originally sat
 * between a '&lt;' and a later '&gt;' are missing. Each such spot is marked
 * with a NOTE(review) comment below; the file will not compile until those
 * spots are restored from the upstream HTTPClient distribution.
 *
 * @see "RFC 2396"
 * @version     0.3-3  06/05/2001
 * @author      Ronald Tschalär
 * @since       V0.3-1
 */
public class URI
{
    /**
     * If true, then the parser will resolve certain URIs in a backwards
     * compatible (but technically incorrect) manner. Example:
     *
     * <pre>
     * base   = http://a/b/c/d;p?q
     * rel    = http:g
     * result = http:g              (correct)
     * result = http://a/b/c/g      (backwards compatible)
     * </pre>
     *
     * See rfc-2396, section 5.2, step 3, second paragraph.
     */
    public static final boolean ENABLE_BACKWARDS_COMPATIBILITY = true;

    /** maps a lower-case scheme name to its default port (an Integer) */
    protected static final Hashtable defaultPorts          = new Hashtable();
    /** the set of lower-case scheme names parsed with the generic syntax */
    protected static final Hashtable usesGenericSyntax     = new Hashtable();
    /** the set of lower-case scheme names parsed with the semi-generic syntax */
    protected static final Hashtable usesSemiGenericSyntax = new Hashtable();

    /* various character classes as defined in the draft */
    protected static final BitSet alphanumChar;
    protected static final BitSet markChar;
    protected static final BitSet reservedChar;
    protected static final BitSet unreservedChar;
    protected static final BitSet uricChar;
    protected static final BitSet pcharChar;
    protected static final BitSet userinfoChar;
    protected static final BitSet schemeChar;
    protected static final BitSet hostChar;
    protected static final BitSet opaqueChar;
    protected static final BitSet reg_nameChar;

    /* These are not directly in the spec, but used for escaping and
     * unescaping parts
     */

    /** list of characters which must not be unescaped when unescaping a scheme */
    public static final BitSet resvdSchemeChar;
    /** list of characters which must not be unescaped when unescaping a userinfo */
    public static final BitSet resvdUIChar;
    /** list of characters which must not be unescaped when unescaping a host */
    public static final BitSet resvdHostChar;
    /** list of characters which must not be unescaped when unescaping a path */
    public static final BitSet resvdPathChar;
    /** list of characters which must not be unescaped when unescaping a query string */
    public static final BitSet resvdQueryChar;
    /** list of characters which must not be escaped when escaping a path */
    public static final BitSet escpdPathChar;
    /** list of characters which must not be escaped when escaping a query string */
    public static final BitSet escpdQueryChar;
    /** list of characters which must not be escaped when escaping a fragment identifier */
    public static final BitSet escpdFragChar;

    static
    {
        // default port per scheme; looked up by defaultPort()
        defaultPorts.put("http", new Integer(80));
        defaultPorts.put("shttp", new Integer(80));
        defaultPorts.put("http-ng", new Integer(80));
        defaultPorts.put("coffee", new Integer(80));
        defaultPorts.put("https", new Integer(443));
        defaultPorts.put("ftp", new Integer(21));
        defaultPorts.put("telnet", new Integer(23));
        defaultPorts.put("nntp", new Integer(119));
        defaultPorts.put("news", new Integer(119));
        defaultPorts.put("snews", new Integer(563));
        defaultPorts.put("hnews", new Integer(80));
        defaultPorts.put("smtp", new Integer(25));
        defaultPorts.put("gopher", new Integer(70));
        defaultPorts.put("wais", new Integer(210));
        defaultPorts.put("whois", new Integer(43));
        defaultPorts.put("whois++", new Integer(63));
        defaultPorts.put("rwhois", new Integer(4321));
        defaultPorts.put("imap", new Integer(143));
        defaultPorts.put("pop", new Integer(110));
        defaultPorts.put("prospero", new Integer(1525));
        defaultPorts.put("irc", new Integer(194));
        defaultPorts.put("ldap", new Integer(389));
        defaultPorts.put("nfs", new Integer(2049));
        defaultPorts.put("z39.50r", new Integer(210));
        defaultPorts.put("z39.50s", new Integer(210));
        defaultPorts.put("vemmi", new Integer(575));
        defaultPorts.put("videotex", new Integer(516));
        defaultPorts.put("cmp", new Integer(829));

        // schemes parsed with the full generic syntax; only the keys matter
        usesGenericSyntax.put("http", Boolean.TRUE);
        usesGenericSyntax.put("https", Boolean.TRUE);
        usesGenericSyntax.put("shttp", Boolean.TRUE);
        usesGenericSyntax.put("coffee", Boolean.TRUE);
        usesGenericSyntax.put("ftp", Boolean.TRUE);
        usesGenericSyntax.put("file", Boolean.TRUE);
        usesGenericSyntax.put("nntp", Boolean.TRUE);
        usesGenericSyntax.put("news", Boolean.TRUE);
        usesGenericSyntax.put("snews", Boolean.TRUE);
        usesGenericSyntax.put("hnews", Boolean.TRUE);
        usesGenericSyntax.put("imap", Boolean.TRUE);
        usesGenericSyntax.put("wais", Boolean.TRUE);
        usesGenericSyntax.put("nfs", Boolean.TRUE);
        usesGenericSyntax.put("sip", Boolean.TRUE);
        usesGenericSyntax.put("sips", Boolean.TRUE);
        usesGenericSyntax.put("sipt", Boolean.TRUE);
        usesGenericSyntax.put("sipu", Boolean.TRUE);

        /* Note: schemes which definitely don't use the generic-URI syntax
         * and must therefore never appear in the above list:
         * "urn", "mailto", "sdp", "service", "tv", "gsm-sms", "tel", "fax",
         * "modem", "eid", "cid", "mid", "data", "ldap"
         */

        // schemes parsed with the semi-generic syntax; only the keys matter
        usesSemiGenericSyntax.put("ldap", Boolean.TRUE);
        usesSemiGenericSyntax.put("irc", Boolean.TRUE);
        usesSemiGenericSyntax.put("gopher", Boolean.TRUE);
        usesSemiGenericSyntax.put("videotex", Boolean.TRUE);
        usesSemiGenericSyntax.put("rwhois", Boolean.TRUE);
        usesSemiGenericSyntax.put("whois++", Boolean.TRUE);
        usesSemiGenericSyntax.put("smtp", Boolean.TRUE);
        usesSemiGenericSyntax.put("telnet", Boolean.TRUE);
        usesSemiGenericSyntax.put("prospero", Boolean.TRUE);
        usesSemiGenericSyntax.put("pop", Boolean.TRUE);
        usesSemiGenericSyntax.put("vemmi", Boolean.TRUE);
        usesSemiGenericSyntax.put("z39.50r", Boolean.TRUE);
        usesSemiGenericSyntax.put("z39.50s", Boolean.TRUE);
        usesSemiGenericSyntax.put("stream", Boolean.TRUE);
        usesSemiGenericSyntax.put("cmp", Boolean.TRUE);

        // alphanum = digits, upper-case and lower-case ASCII letters
        alphanumChar = new BitSet(128);
        for (int ch='0'; ch<='9'; ch++) alphanumChar.set(ch);
        for (int ch='A'; ch<='Z'; ch++) alphanumChar.set(ch);
        for (int ch='a'; ch<='z'; ch++) alphanumChar.set(ch);

        markChar = new BitSet(128);
        markChar.set('-');
        markChar.set('_');
        markChar.set('.');
        markChar.set('!');
        markChar.set('~');
        markChar.set('*');
        markChar.set('\'');
        markChar.set('(');
        markChar.set(')');

        reservedChar = new BitSet(128);
        reservedChar.set(';');
        reservedChar.set('/');
        reservedChar.set('?');
        reservedChar.set(':');
        reservedChar.set('@');
        reservedChar.set('&');
        reservedChar.set('=');
        reservedChar.set('+');
        reservedChar.set('$');
        reservedChar.set(',');

        // unreserved = alphanum + mark
        unreservedChar = new BitSet(128);
        unreservedChar.or(alphanumChar);
        unreservedChar.or(markChar);

        // uric = unreserved + reserved + '%'
        uricChar = new BitSet(128);
        uricChar.or(unreservedChar);
        uricChar.or(reservedChar);
        uricChar.set('%');

        pcharChar = new BitSet(128);
        pcharChar.or(unreservedChar);
        pcharChar.set('%');
        pcharChar.set(':');
        pcharChar.set('@');
        pcharChar.set('&');
        pcharChar.set('=');
        pcharChar.set('+');
        pcharChar.set('$');
        pcharChar.set(',');

        userinfoChar = new BitSet(128);
        userinfoChar.or(unreservedChar);
        userinfoChar.set('%');
        userinfoChar.set(';');
        userinfoChar.set(':');
        userinfoChar.set('&');
        userinfoChar.set('=');
        userinfoChar.set('+');
        userinfoChar.set('$');
        userinfoChar.set(',');

        // this actually shouldn't contain uppercase letters...
        schemeChar = new BitSet(128);
        schemeChar.or(alphanumChar);
        schemeChar.set('+');
        schemeChar.set('-');
        schemeChar.set('.');

        opaqueChar = new BitSet(128);
        opaqueChar.or(uricChar);

        hostChar = new BitSet(128);
        hostChar.or(alphanumChar);
        hostChar.set('-');
        hostChar.set('.');

        reg_nameChar = new BitSet(128);
        reg_nameChar.or(unreservedChar);
        reg_nameChar.set('$');
        reg_nameChar.set(',');
        reg_nameChar.set(';');
        reg_nameChar.set(':');
        reg_nameChar.set('@');
        reg_nameChar.set('&');
        reg_nameChar.set('=');
        reg_nameChar.set('+');

        // the resvd* sets list the delimiters of each component; they are
        // passed to unescape() as the characters to leave escaped
        resvdSchemeChar = new BitSet(128);
        resvdSchemeChar.set(':');

        resvdUIChar = new BitSet(128);
        resvdUIChar.set('@');

        resvdHostChar = new BitSet(128);
        resvdHostChar.set(':');
        resvdHostChar.set('/');
        resvdHostChar.set('?');
        resvdHostChar.set('#');

        resvdPathChar = new BitSet(128);
        resvdPathChar.set('/');
        resvdPathChar.set(';');
        resvdPathChar.set('?');
        resvdPathChar.set('#');

        resvdQueryChar = new BitSet(128);
        resvdQueryChar.set('#');

        // the escpd* sets are passed to escape() as the allowed characters
        escpdPathChar = new BitSet(128);
        escpdPathChar.or(pcharChar);
        escpdPathChar.set('%');
        escpdPathChar.set('/');
        escpdPathChar.set(';');

        escpdQueryChar = new BitSet(128);
        escpdQueryChar.or(uricChar);
        escpdQueryChar.clear('#');

        escpdFragChar = new BitSet(128);
        escpdFragChar.or(uricChar);
    }

    /* our uri in pieces */

    /* the three values of the type field */
    protected static final int OPAQUE = 0;
    protected static final int SEMI_GENERIC = 1;
    protected static final int GENERIC = 2;

    protected int type;
    protected String scheme;
    protected String opaque;
    protected String userinfo;
    protected String host;
    /* -1 means "not set"; the parsers also store -1 for the scheme's default port */
    protected int port = -1;
    protected String path;
    protected String query;
    protected String fragment;

    /* cache the java.net.URL */
    protected URL url = null;


    // Constructors

    /**
     * Constructs a URI from the given string representation. The string
     * must be an absolute URI.
     *
     * @param uri a String containing an absolute URI
     * @exception ParseException if no scheme can be found or a specified
     *                           port cannot be parsed as a number
     */
    public URI(String uri) throws ParseException
    {
        this((URI) null, uri);
    }

    /**
     * Constructs a URI from the given string representation, relative to
     * the given base URI.
     *
     * @param base    the base URI, relative to which <var>rel_uri</var>
     *                is to be parsed; may be null
     * @param rel_uri a String containing a relative or absolute URI
     * @exception ParseException if <var>base</var> is null and
     *                           <var>rel_uri</var> has no scheme, or
     *                           if <var>base</var> is not null,
     *                           <var>rel_uri</var> has no scheme and the
     *                           base's scheme uses neither the generic nor
     *                           the semi-generic syntax, or
     *                           if a given port cannot be parsed as a number
     */
    public URI(URI base, String rel_uri) throws ParseException
    {
        /* Parsing is done according to the following RE:
         *
         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         *   12            3  4          5       6  7        8 9
         *
         * 2: scheme
         * 4: authority
         * 5: path
         * 7: query
         * 9: fragment
         */

        char[] uri = rel_uri.toCharArray();
        int pos = 0, idx, len = uri.length;

        // trim()
        while (pos < len && Character.isWhitespace(uri[pos])) pos++;
        while (len > 0 && Character.isWhitespace(uri[len-1])) len--;

        // strip the special "url" or "uri" scheme
        if (pos < len-3 && uri[pos+3] == ':' &&
            (uri[pos+0] == 'u' || uri[pos+0] == 'U') &&
            (uri[pos+1] == 'r' || uri[pos+1] == 'R') &&
            (uri[pos+2] == 'i' || uri[pos+2] == 'I' ||
             uri[pos+2] == 'l' || uri[pos+2] == 'L'))
            pos += 4;

        // get scheme: (([^:/?#]+):)?
        idx = pos;
        while (idx < len && uri[idx] != ':' && uri[idx] != '/' &&
               uri[idx] != '?' && uri[idx] != '#')
            idx++;
        if (idx < len && uri[idx] == ':')
        {
            scheme = rel_uri.substring(pos, idx).trim().toLowerCase();
            pos = idx + 1;
        }

        // check and resolve scheme
        String final_scheme = scheme;
        if (scheme == null)
        {
            if (base == null)
                throw new ParseException("No scheme found");
            final_scheme = base.scheme;
        }

        // check for generic vs. opaque
        type = usesGenericSyntax(final_scheme) ? GENERIC :
               usesSemiGenericSyntax(final_scheme) ? SEMI_GENERIC : OPAQUE;
        if (type == OPAQUE)
        {
            if (base != null && scheme == null)
                throw new ParseException("Can't resolve relative URI for " +
                                         "scheme " + final_scheme);

            // NOTE(review): substring(pos) runs to the end of rel_uri and so
            // ignores the trimmed length in len; trailing whitespace appears
            // to end up escaped inside the opaque part - confirm intent.
            opaque = escape(rel_uri.substring(pos), opaqueChar, true);
            // a leading '/' is stored in escaped form
            if (opaque.length() > 0 && opaque.charAt(0) == '/')
                opaque = "%2F" + opaque.substring(1);
            return;
        }

        // get authority: (//([^/?#]*))?
        if (pos+1 < len && uri[pos] == '/' && uri[pos+1] == '/')
        {
            pos += 2;
            idx = pos;
            while (idx < len && uri[idx] != '/' && uri[idx] != '?' &&
                   uri[idx] != '#')
                idx++;

            parse_authority(rel_uri.substring(pos, idx), final_scheme);
            pos = idx;
        }

        // handle semi-generic and generic uri's
        if (type == SEMI_GENERIC)
        {
            // everything after the authority is the (opaque) path
            // NOTE(review): as above, substring(pos) ignores the trimmed len.
            path = escape(rel_uri.substring(pos), uricChar, true);
            if (path.length() > 0 && path.charAt(0) != '/')
                path = '/' + path;
        }
        else
        {
            // get path: ([^?#]*)
            idx = pos;
            while (idx < len && uri[idx] != '?' && uri[idx] != '#')
                idx++;
            path = escape(rel_uri.substring(pos, idx), escpdPathChar, true);
            pos = idx;

            // get query: (\?([^#]*))?
            if (pos < len && uri[pos] == '?')
            {
                pos += 1;
                idx = pos;
                while (idx < len && uri[idx] != '#')
                    idx++;
                this.query = escape(rel_uri.substring(pos, idx), escpdQueryChar, true);
                pos = idx;
            }

            // get fragment: (#(.*))?
            if (pos < len && uri[pos] == '#')
                this.fragment = escape(rel_uri.substring(pos+1, len), escpdFragChar, true);
        }

        // now resolve the parts relative to the base
        if (base != null)
        {
            // an explicit scheme makes the URI absolute, unless it equals the
            // base's scheme and backwards compatibility is enabled
            if (scheme != null &&                       // resolve scheme
                !(scheme.equals(base.scheme) && ENABLE_BACKWARDS_COMPATIBILITY))
                return;
            scheme = base.scheme;

            if (host != null)                           // resolve authority
                return;
            userinfo = base.userinfo;
            host = base.host;
            port = base.port;

            if (type == SEMI_GENERIC)                   // can't resolve relative paths
                return;

            if (path.length() == 0 && query == null)    // current doc
            {
                path = base.path;
                query = base.query;
                return;
            }

            if (path.length() == 0 || path.charAt(0) != '/')    // relative path
            {
                // replace everything after the base path's last '/'
                idx = (base.path != null) ? base.path.lastIndexOf('/') : -1;
                if (idx < 0)
                    path = '/' + path;
                else
                    path = base.path.substring(0, idx+1) + path;

                path = canonicalizePath(path);
            }
        }
    }

    /**
     * Remove all "/../" and "/./" from path, where possible. Leading "/../"'s
     * are not removed.
     *
     * @param path the path to canonicalize
     * @return the canonicalized path
     */
    public static String canonicalizePath(String path)
    {
        int idx, len = path.length();
        // quick exit: return the input untouched unless the first "/." is
        // followed by end-of-string, by '/', or by '.' + (end-of-string or '/')
        // NOTE(review): only the first occurrence of "/." is inspected, so a
        // path such as "/.a/../b" looks like it is returned unchanged - confirm.
        if (!((idx = path.indexOf("/.")) != -1 &&
              (idx == len-2 || path.charAt(idx+2) == '/' ||
               (path.charAt(idx+2) == '.' &&
                (idx == len-3 || path.charAt(idx+3) == '/')) )))
            return path;

        char[] p = new char[path.length()];     // clean path
        path.getChars(0, p.length, p, 0);

        int beg = 0;
        // NOTE(review): source text is missing here. The rest of the loop
        // header and the start of the loop body (everything between "idx" and
        // "beg") were lost; what follows is the surviving tail, kept verbatim.
        // Restore from upstream.
        for (idx=1; idx beg && p[end] != '/') end--;
                    if (p[end] != '/') continue;
                    if (idx == len-2) end++;
                    idx += 2;
                }
                else
                    continue;
                // close the gap: shift the remainder of the path down to end
                System.arraycopy(p, idx, p, end, len-idx);
                len -= idx - end;
                idx = end;
            }
        }

        return new String(p, 0, len);
    }

    /**
     * Parse the authority specific part, filling in userinfo, host and port.
     *
     * @param authority the authority component, without the leading "//"
     * @param scheme    the scheme, used to look up the default port
     * @exception ParseException if an opening '[' has no closing ']' or the
     *                           port is not a non-negative number
     */
    private void parse_authority(String authority, String scheme)
            throws ParseException
    {
        /* The authority is further parsed according to:
         *
         *  ^(([^@]*)@?)(\[[^]]*\]|[^:]*)?(:(.*))?
         *   12         3                 4 5
         *
         * 2: userinfo
         * 3: host
         * 5: port
         */

        char[] uri = authority.toCharArray();
        int pos = 0, idx, len = uri.length;

        // get userinfo: (([^@]*)@?)
        idx = pos;
        while (idx < len && uri[idx] != '@')
            idx++;
        if (idx < len && uri[idx] == '@')
        {
            this.userinfo = escape(authority.substring(pos, idx), userinfoChar, true);
            pos = idx + 1;
        }

        // get host: (\[[^]]*\]|[^:]*)?
        idx = pos;
        if (idx < len && uri[idx] == '[')       // IPv6
        {
            while (idx < len && uri[idx] != ']')
                idx++;
            if (idx == len)
                throw new ParseException("No closing ']' found for opening '['"+
                                         " at position " + pos +
                                         " in authority `" + authority + "'");
            // the host is stored without the enclosing brackets, unescaped
            this.host = authority.substring(pos+1, idx);
            idx++;
        }
        else
        {
            while (idx < len && uri[idx] != ':')
                idx++;
            this.host = escape(authority.substring(pos, idx), uricChar, true);
        }
        pos = idx;

        // get port: (:(.*))?
        // (a ':' that is the last character of the authority is ignored)
        if (pos < (len-1) && uri[pos] == ':')
        {
            int p;
            try
            {
                p = Integer.parseInt(
                        unescape(authority.substring(pos+1, len), null));
                if (p < 0) throw new NumberFormatException();
            }
            catch (NumberFormatException e)
            {
                throw new ParseException(authority.substring(pos+1, len) +
                                         " is an invalid port number");
            }
            // the scheme's default port is normalized to -1
            if (p == defaultPort(scheme))
                this.port = -1;
            else
                this.port = p;
        }
    }

    /**
     * Construct a URI from the given URL.
     *
     * @param url the URL
     * @exception ParseException if <code>url.toExternalForm()</code> generates
     *                           an invalid string representation
     */
    public URI(URL url) throws ParseException
    {
        this((URI) null, url.toExternalForm());
    }

    /**
     * Constructs a URI from the given parts, using the default port for
     * this scheme (if known). The parts must be in unescaped form.
     *
     * @param scheme the scheme (sometimes known as protocol)
     * @param host   the host
     * @param path   the path part
     * @exception ParseException if <var>scheme</var> is null
     */
    public URI(String scheme, String host, String path) throws ParseException
    {
        this(scheme, null, host, -1, path, null, null);
    }

    /**
     * Constructs a URI from the given parts. The parts must be in unescaped
     * form.
     *
     * @param scheme the scheme (sometimes known as protocol)
     * @param host   the host
     * @param port   the port
     * @param path   the path part
     * @exception ParseException if <var>scheme</var> is null
     */
    public URI(String scheme, String host, int port, String path)
            throws ParseException
    {
        this(scheme, null, host, port, path, null, null);
    }

    /**
     * Constructs a URI from the given parts. Any part except for the
     * the scheme may be null. The parts must be in unescaped form.
     * The resulting URI is generic if the scheme is registered as using the
     * generic syntax, and semi-generic otherwise.
     *
     * @param scheme   the scheme (sometimes known as protocol)
     * @param userinfo the userinfo
     * @param host     the host
     * @param port     the port
     * @param path     the path part
     * @param query    the query string
     * @param fragment the fragment identifier
     * @exception ParseException if <var>scheme</var> is null
     */
    public URI(String scheme, String userinfo, String host, int port,
               String path, String query, String fragment)
            throws ParseException
    {
        if (scheme == null)
            throw new ParseException("missing scheme");
        this.scheme = escape(scheme.trim().toLowerCase(), schemeChar, true);
        if (userinfo != null)
            this.userinfo = escape(userinfo.trim(), userinfoChar, true);
        if (host != null)
        {
            host = host.trim();
            // a host recognised by isIPV6Addr is stored as given, not escaped
            this.host = isIPV6Addr(host) ? host : escape(host, hostChar, true);
        }
        // the scheme's default port leaves this.port at its initial -1
        if (port != defaultPort(scheme))
            this.port = port;
        if (path != null)
            this.path = escape(path.trim(), escpdPathChar, true);       // ???
        if (query != null)
            this.query = escape(query.trim(), escpdQueryChar, true);
        if (fragment != null)
            this.fragment = escape(fragment.trim(), escpdFragChar, true);

        type = usesGenericSyntax(scheme) ? GENERIC : SEMI_GENERIC;
    }

    /**
     * Decides whether the given host string is treated as an IPv6 address
     * literal. A host without any ':' is never treated as one.
     *
     * @param host the host to test
     * @return true if the host is considered an IPv6 address
     */
    private static final boolean isIPV6Addr(String host)
    {
        if (host.indexOf(':') < 0)
            return false;

        // NOTE(review): source text is missing here. The rest of the loop
        // header and the start of the per-character test (everything between
        // "idx" and "'9'") were lost; the surviving tokens are kept verbatim.
        // Restore from upstream.
        for (int idx=0; idx '9') && ch != ':')
                return false;
        }

        return true;
    }

    /**
     * Constructs an opaque URI from the given parts.
     *
     * @param scheme the scheme (sometimes known as protocol)
     * @param opaque the opaque part
     * @exception ParseException if <var>scheme</var> is null
     */
    public URI(String scheme, String opaque) throws ParseException
    {
        if (scheme == null)
            throw new ParseException("missing scheme");
        this.scheme = escape(scheme.trim().toLowerCase(), schemeChar, true);
        this.opaque = escape(opaque, opaqueChar, true);

        type = OPAQUE;
    }


    // Class Methods

    /**
     * Checks the internal table of generic-syntax schemes. The scheme is
     * trimmed and lower-cased before the lookup.
     *
     * @param scheme the scheme to look up
     * @return true if the scheme should be parsed according to the
     *         generic-URI syntax
     */
    public static boolean usesGenericSyntax(String scheme)
    {
        return usesGenericSyntax.containsKey(scheme.trim().toLowerCase());
    }

    /**
     * Checks the internal table of semi-generic-syntax schemes. The scheme
     * is trimmed and lower-cased before the lookup.
     *
     * @param scheme the scheme to look up
     * @return true if the scheme should be parsed according to a
     *         semi-generic-URI syntax
     *         <code>&lt;scheme&gt;://&lt;hostport&gt;/&lt;opaque&gt;</code>
     */
    public static boolean usesSemiGenericSyntax(String scheme)
    {
        return usesSemiGenericSyntax.containsKey(scheme.trim().toLowerCase());
    }

    /**
     * Return the default port used by a given protocol. The protocol name
     * is trimmed and lower-cased before the lookup.
     *
     * @param protocol the protocol
     * @return the port number, or 0 if unknown
     */
    public final static int defaultPort(String protocol)
    {
        Integer port = (Integer) defaultPorts.get(protocol.trim().toLowerCase());
        return (port != null) ? port.intValue() : 0;
    }


    // Instance Methods

    /**
     * @return the scheme (often also referred to as protocol)
     */
    public String getScheme()
    {
        return scheme;
    }

    /**
     * @return the opaque part, or null if this URI is generic
     */
    public String getOpaque()
    {
        return opaque;
    }

    /**
     * @return the host
     */
    public String getHost()
    {
        return host;
    }

    /**
     * @return the port, or -1 if it's the default port, or 0 if unknown
     */
    public int getPort()
    {
        return port;
    }

    /**
     * @return the user info
     */
    public String getUserinfo()
    {
        return userinfo;
    }

    /**
     * @return the path
     */
    public String getPath()
    {
        return path;
    }

    /**
     * @return the query string
     */
    public String getQueryString()
    {
        return query;
    }

    /**
     * @return the path and query, joined by a '?'; just the path if there
     *         is no query, and "?" plus the query if there is no path
     */
    public String getPathAndQuery()
    {
        if (query == null)
            return path;
        if (path == null)
            return "?" + query;
        return path + "?" + query;
    }

    /**
     * @return the fragment
     */
    public String getFragment()
    {
        return fragment;
    }

    /**
     * Does the scheme specific part of this URI use the generic-URI syntax?
     *
     * <p>In general, URIs are split into two categories: opaque-URI and
     * generic-URI. The generic-URI syntax is the syntax most are familiar
     * with from URLs such as ftp- and http-URLs, which is roughly:
     * <pre>
     * generic-URI = scheme ":" [ "//" server ] [ "/" ] [ path_segments ] [ "?" query ]
     * </pre>
     * (see RFC-2396 for exact syntax). Only URLs using the generic-URI syntax
     * can be used to create and resolve relative URIs.
     *
     * <p>Whether a given scheme is parsed according to the generic-URI
     * syntax or whether it is treated as opaque is determined by an internal
     * table of URI schemes.
     *
     * @return true if this URI is of the generic type
     * @see "RFC 2396"
     */
    public boolean isGenericURI()
    {
        return (type == GENERIC);
    }

    /**
     * Does the scheme specific part of this URI use the semi-generic-URI syntax?
     *
     * <p>Many schemes which don't follow the full generic syntax actually
     * follow a reduced form where the path part is treated as opaque. This
     * is used for example by ldap, smtp, pop, etc, and is roughly
     * <pre>
     * semi-generic-URI = scheme ":" [ "//" server ] [ "/" [ opaque_path ] ]
     * </pre>
     * I.e. parsing is identical to the generic-syntax, except that the path
     * part is not further parsed. URLs using the semi-generic-URI syntax can
     * be used to create and resolve relative URIs with the restriction that
     * all paths are treated as absolute.
     *
     * <p>Whether a given scheme is parsed according to the semi-generic-URI
     * syntax is determined by an internal table of URI schemes.
     *
     * @return true if this URI is of the semi-generic type
     * @see #isGenericURI()
     */
    public boolean isSemiGenericURI()
    {
        return (type == SEMI_GENERIC);
    }

    /**
     * Will try to create a java.net.URL object from this URI. The URL is
     * cached in the <var>url</var> field, so later calls return the same
     * object.
     *
     * @return the URL
     * @exception MalformedURLException if no handler is available for the
     *            scheme
     */
    public URL toURL() throws MalformedURLException
    {
        if (url != null) return url;

        if (opaque != null)
            return (url = new URL(scheme + ":" + opaque));

        // the userinfo, if any, is passed to URL as part of the host string
        String hostinfo;
        if (userinfo != null && host != null)
            hostinfo = userinfo + "@" + host;
        else if (userinfo != null)
            hostinfo = userinfo + "@";
        else
            hostinfo = host;

        // file = path + query + fragment, in escaped form; "/" if no path
        StringBuffer file = new StringBuffer(100);
        assemblePath(file, true, true, false);

        url = new URL(scheme, hostinfo, port, file.toString());
        return url;
    }

    /**
     * Appends path, query and fragment to the given buffer.
     *
     * @param buf         the buffer to append to
     * @param printEmpty  if true, a null or empty path is written as "/"
     * @param incFragment if true, the fragment (if any) is appended too
     * @param unescape    if true, each part is unescaped before being
     *                    appended, leaving its reserved characters escaped
     */
    private final void assemblePath(StringBuffer buf, boolean printEmpty,
                                    boolean incFragment, boolean unescape)
    {
        if ((path == null || path.length() == 0) && printEmpty)
            buf.append('/');

        if (path != null)
            buf.append(unescape ? unescapeNoPE(path, resvdPathChar) : path);

        if (query != null)
        {
            buf.append('?');
            buf.append(unescape ? unescapeNoPE(query, resvdQueryChar) : query);
        }

        if (fragment != null && incFragment)
        {
            buf.append('#');
            buf.append(unescape ? unescapeNoPE(fragment, null) : fragment);
        }
    }

    /**
     * Assembles the string form of this URI from its parts.
     *
     * @param unescape if true, the parts are unescaped before assembly
     * @return the assembled URI
     */
    private final String stringify(boolean unescape)
    {
        StringBuffer uri = new StringBuffer(100);

        if (scheme != null)
        {
            uri.append(unescape ? unescapeNoPE(scheme, resvdSchemeChar) : scheme);
            uri.append(':');
        }

        if (opaque != null)             // it's an opaque-uri
        {
            uri.append(unescape ? unescapeNoPE(opaque, null) : opaque);
            return uri.toString();
        }

        if (userinfo != null || host != null || port != -1)
            uri.append("//");

        if (userinfo != null)
        {
            uri.append(unescape ? unescapeNoPE(userinfo, resvdUIChar) : userinfo);
            uri.append('@');
        }

        if (host != null)
        {
            // a host containing ':' is written in brackets, never unescaped
            if (host.indexOf(':') < 0)
                uri.append(unescape ? unescapeNoPE(host, resvdHostChar) : host);
            else
                uri.append('[').append(host).append(']');
        }

        if (port != -1)
        {
            uri.append(':');
            uri.append(port);
        }

        assemblePath(uri, false, true, unescape);

        return uri.toString();
    }

    /**
     * @return a string representation of this URI suitable for use in
     *         links, headers, etc.
     */
    public String toExternalForm()
    {
        return stringify(false);
    }

    /**
     * Return the URI as string. This differs from toExternalForm() in that
     * all elements are unescaped before assembly. This is <em>not suitable</em>
     * for passing to other apps or in header fields and such, and is usually
     * not what you want.
     *
     * @return the URI as a string
     * @see #toExternalForm()
     */
    public String toString()
    {
        return stringify(true);
    }

    /**
     * Compares this URI against another URI or a java.net.URL. Which parts
     * are compared depends on this URI's type: the opaque part for opaque
     * URIs; userinfo, host (ignoring case), port and path for semi-generic
     * URIs; and additionally query and fragment for generic URIs.
     *
     * @param other the object to compare against
     * @return true if <var>other</var> is either a URI or URL and it
     *         matches the current URI
     */
    public boolean equals(Object other)
    {
        if (other instanceof URI)
        {
            URI o = (URI) other;
            return (scheme.equals(o.scheme) &&
                    (
                     type == OPAQUE && areEqual(opaque, o.opaque) ||

                     type == SEMI_GENERIC &&
                     areEqual(userinfo, o.userinfo) &&
                     areEqualIC(host, o.host) &&
                     port == o.port &&
                     areEqual(path, o.path) ||

                     type == GENERIC &&
                     areEqual(userinfo, o.userinfo) &&
                     areEqualIC(host, o.host) &&
                     port == o.port &&
                     pathsEqual(path, o.path) &&
                     areEqual(query, o.query) &&
                     areEqual(fragment, o.fragment)
                    ));
        }

        if (other instanceof URL)
        {
            URL o = (URL) other;
            String h, f;

            if (userinfo != null)
                h = userinfo + "@" + host;
            else
                h = host;

            f = getPathAndQuery();

            // NOTE(review): the port test below is also true whenever the
            // URL's port equals the scheme's default port, whatever this
            // URI's own port is - so e.g. a URI on port 8080 looks like it
            // would match a URL that spells out the default port. Confirm
            // whether "port == -1 &&" is missing from the second operand.
            return (scheme.equalsIgnoreCase(o.getProtocol()) &&
                    (type == OPAQUE && opaque.equals(o.getFile()) ||

                     type == SEMI_GENERIC &&
                     areEqualIC(h, o.getHost()) &&
                     (port == o.getPort() ||
                      o.getPort() == defaultPort(scheme)) &&
                     areEqual(f, o.getFile()) ||

                     type == GENERIC &&
                     areEqualIC(h, o.getHost()) &&
                     (port == o.getPort() ||
                      o.getPort() == defaultPort(scheme)) &&
                     pathsEqual(f, o.getFile()) &&
                     areEqual(fragment, o.getRef())
                    )
                   );
        }

        return false;
    }

    /**
     * Null-safe string comparison: two nulls are equal; otherwise the
     * strings must match either as they are or after unescaping both.
     */
    private static final boolean areEqual(String s1, String s2)
    {
        return (s1 == null && s2 == null ||
                s1 != null && s2 != null &&
                (s1.equals(s2) ||
                 unescapeNoPE(s1, null).equals(unescapeNoPE(s2, null)))
               );
    }

    /**
     * Same as areEqual(), but ignoring case.
     */
    private static final boolean areEqualIC(String s1, String s2)
    {
        return (s1 == null && s2 == null ||
                s1 != null && s2 != null &&
                (s1.equalsIgnoreCase(s2) ||
                 unescapeNoPE(s1, null).equalsIgnoreCase(unescapeNoPE(s2, null)))
               );
    }

    /**
     * Compares two paths piece by piece. The paths are split at '/' and ';';
     * the delimiters must match exactly, while the pieces between them may
     * match either literally or after unescaping.
     *
     * @param p1 the first path, may be null
     * @param p2 the second path, may be null
     * @return true if both are null or both are considered the same path
     */
    private static final boolean pathsEqual(String p1, String p2)
    {
        if (p1 == null && p2 == null)
            return true;
        if (p1 == null || p2 == null)
            return false;
        if (p1.equals(p2))
            return true;

        // ok, so it wasn't that simple. Let's split into parts and compare
        // unescaped.
        int pos1 = 0, end1 = p1.length(), pos2 = 0, end2 = p2.length();
        while (pos1 < end1 && pos2 < end2)
        {
            int start1 = pos1, start2 = pos2;

            // advance each position to the next delimiter (or the end)
            char ch;
            while (pos1 < end1 && (ch = p1.charAt(pos1)) != '/' && ch != ';')
                pos1++;
            while (pos2 < end2 && (ch = p2.charAt(pos2)) != '/' && ch != ';')
                pos2++;

            // one path ended but the other did not, or the delimiters differ
            if (pos1 == end1 && pos2 < end2 ||
                pos2 == end2 && pos1 < end1 ||
                pos1 < end1 && pos2 < end2 && p1.charAt(pos1) != p2.charAt(pos2))
                return false;

            // the pieces differ literally and also after unescaping
            if ((!p1.regionMatches(start1, p2, start2, pos1-start1) ||
                 (pos1-start1) != (pos2-start2)) &&
                !unescapeNoPE(p1.substring(start1, pos1), null).equals(
                    unescapeNoPE(p2.substring(start2, pos2), null)))
                return false;

            // skip the delimiter
            pos1++;
            pos2++;
        }

        return (pos1 == end1 && pos2 == end2);
    }

    /* cached hash code; -1 means "not computed yet" */
    private int hashCode = -1;

    /**
     * The hash code is calculated over scheme, host, path, and query, or
     * over scheme and opaque part for opaque URIs. All parts are unescaped
     * first, and the host is lower-cased. The value is cached.
     *
     * @return the hash code
     */
    public int hashCode()
    {
        if (hashCode == -1)
            hashCode = (scheme != null ? unescapeNoPE(scheme, null).hashCode() : 0) +
                       (type == OPAQUE ?
                          (opaque != null ? unescapeNoPE(opaque, null).hashCode() : 0) * 7
                        : (host != null ? unescapeNoPE(host, null).toLowerCase().hashCode() : 0) * 7 +
                          (path != null ? unescapeNoPE(path, null).hashCode() : 0) * 13 +
                          (query != null ? unescapeNoPE(query, null).hashCode() : 0) * 17);

        return hashCode;
    }

    /**
     * Escape any character not in the given character class. This is a
     * String wrapper around {@link #escape(char[], BitSet, boolean)}.
     *
     * @param elem         the string to escape
     * @param allowed_char the BitSet of all allowed characters
     * @param utf8         if true, will first UTF-8 encode unallowed characters
     * @return the string with all characters not in allowed_char
     *         escaped
     */
    public static String escape(String elem, BitSet allowed_char, boolean utf8)
    {
        return new String(escape(elem.toCharArray(), allowed_char, utf8));
    }

    /**
     * Escape any character not in the given character class. With
     * <var>utf8</var> set, an unallowed character is written as the %xx
     * escapes of its UTF-8 encoding, with a surrogate pair encoded as one
     * four-byte sequence; otherwise it is written as a single %xx escape.
     * If nothing needs escaping the input array itself is returned.
     *
     * @param elem         the array of characters to escape
     * @param allowed_char the BitSet of all allowed characters
     * @param utf8         if true, will first UTF-8 encode unallowed characters
     * @return the elem array with all characters not in allowed_char
     *         escaped
     */
    public static char[] escape(char[] elem, BitSet allowed_char, boolean utf8)
    {
        // first pass: count how many extra characters the result needs
        int cnt=0;
        // NOTE(review): source text is missing here. The rest of the loop
        // header and the start of the counting logic (everything between
        // "idx" and "= 0x0080") were lost; the surviving tokens are kept
        // verbatim. Restore from upstream.
        for (int idx=0; idx= 0x0080)
                        cnt += 3;
                    if (elem[idx] >= 0x00800)
                        cnt += 3;
                    // a surrogate pair becomes one 4-byte sequence, not two
                    // 3-byte ones
                    if ((elem[idx] & 0xFC00) == 0xD800 && idx+1 < elem.length &&
                        (elem[idx+1] & 0xFC00) == 0xDC00)
                        cnt -= 6;
                }
            }
        }

        if (cnt == 0) return elem;

        // second pass: copy, escaping as we go
        char[] tmp = new char[elem.length + cnt];
        // NOTE(review): source text is missing here as well. The rest of the
        // loop header and the branches preceding the two-byte UTF-8 case
        // (everything between "idx" and "> 6") were lost; the surviving
        // tokens are kept verbatim. Restore from upstream.
        for (int idx=0, pos=0; idx> 6) & 0x1F));
                    pos = enc(tmp, pos, 0x80 | ((c >> 0) & 0x3F));
                }
                else if (!((c & 0xFC00) == 0xD800 && idx+1 < elem.length &&
                           (elem[idx+1] & 0xFC00) == 0xDC00))
                {
                    // not the start of a surrogate pair: three-byte sequence
                    pos = enc(tmp, pos, 0xE0 | ((c >> 12) & 0x0F));
                    pos = enc(tmp, pos, 0x80 | ((c >> 6) & 0x3F));
                    pos = enc(tmp, pos, 0x80 | ((c >> 0) & 0x3F));
                }
                else
                {
                    // surrogate pair: combine both halves (consuming the low
                    // surrogate) and emit a four-byte sequence
                    int ch = ((c & 0x03FF) << 10) | (elem[++idx] & 0x03FF);
                    ch += 0x10000;
                    pos = enc(tmp, pos, 0xF0 | ((ch >> 18) & 0x07));
                    pos = enc(tmp, pos, 0x80 | ((ch >> 12) & 0x3F));
                    pos = enc(tmp, pos, 0x80 | ((ch >> 6) & 0x3F));
                    pos = enc(tmp, pos, 0x80 | ((ch >> 0) & 0x3F));
                }
            }
            else
                pos = enc(tmp, pos, c);
        }

        return tmp;
    }

    /* upper-case hex digits used by enc() */
    private static final char[] hex =
        {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'};

    /**
     * Writes the low eight bits of <var>c</var> as a three-character
     * "%XX" escape into <var>out</var>, starting at <var>pos</var>.
     *
     * @return the position following the written escape
     */
    private static final int enc(char[] out, int pos, int c)
    {
        out[pos++] = '%';
        out[pos++] = hex[(c >> 4) & 0xf];
        out[pos++] = hex[c & 0xf];
        return pos;
    }

    /**
     * Unescape escaped characters (i.e. %xx) except reserved ones. Runs of
     * escapes that form a UTF-8 sequence are decoded into a single character
     * (or a surrogate pair for four-byte sequences).
     *
     * @param str      the string to unescape; may be null
     * @param reserved the characters which may not be unescaped, or null
     * @return the unescaped string; <var>str</var> itself if it is null or
     *         contains no '%'
     * @exception ParseException declared for callers; in the code visible
     *            here the throw is commented out and an invalid escape is
     *            passed through instead
     */
    public static final String unescape(String str, BitSet reserved)
            throws ParseException
    {
        if (str == null || str.indexOf('%') == -1)
            return str;                         // an optimization

        char[] buf = str.toCharArray();
        char[] res = new char[buf.length];

        // utf collects the bytes of a UTF-8 sequence in progress;
        // utf_len is its expected length, or -1 when none is in progress
        char[] utf = new char[4];
        int utf_idx = 0, utf_len = -1;
        int didx = 0;
        // NOTE(review): source text is missing here. The rest of the loop
        // header, the '%' test, the declaration of ch and the opening of the
        // try block (everything between "sidx" and "buf.length") were lost;
        // the surviving tokens are kept verbatim. Restore from upstream.
        for (int sidx=0; sidx buf.length)
                        throw new NumberFormatException();
                    ch = Integer.parseInt(str.substring(sidx+1,sidx+3), 16);
                    if (ch < 0)
                        throw new NumberFormatException();
                    sidx += 2;
                }
                catch (NumberFormatException e)
                {
                    /* Hmm, people not reading specs again, so we just
                     * ignore it...
                    throw new ParseException(str.substring(sidx,sidx+3) +
                                            " is an invalid code");
                    */
                    ch = buf[sidx];
                }

                // check if we're working on a utf-char
                if (utf_len > 0)
                {
                    if ((ch & 0xC0) != 0x80)    // oops, we misinterpreted
                    {
                        didx = copyBuf(utf, utf_idx, ch, res, didx, reserved, false);
                        utf_len = -1;
                    }
                    else if (utf_idx == utf_len - 1)    // end-of-char
                    {
                        // combine the collected bytes with this final one
                        if ((utf[0] & 0xE0) == 0xC0)
                            ch = (utf[0] & 0x1F) << 6 |
                                 (ch & 0x3F);
                        else if ((utf[0] & 0xF0) == 0xE0)
                            ch = (utf[0] & 0x0F) << 12 |
                                 (utf[1] & 0x3F) << 6 |
                                 (ch & 0x3F);
                        else
                            ch = (utf[0] & 0x07) << 18 |
                                 (utf[1] & 0x3F) << 12 |
                                 (utf[2] & 0x3F) << 6 |
                                 (ch & 0x3F);
                        if (reserved != null && reserved.get(ch))
                            didx = copyBuf(utf, utf_idx, ch, res, didx, null, true);
                        else if (utf_len < 4)
                            res[didx++] = (char) ch;
                        else
                        {
                            // four-byte sequence: store as a surrogate pair
                            ch -= 0x10000;
                            res[didx++] = (char) ((ch >> 10) | 0xD800);
                            res[didx++] = (char) ((ch & 0x03FF) | 0xDC00);
                        }
                        utf_len = -1;
                    }
                    else                                // continue
                        utf[utf_idx++] = (char) ch;
                }

                // check if this is the start of a utf-char
                else if ((ch & 0xE0) == 0xC0 || (ch & 0xF0) == 0xE0 ||
                         (ch & 0xF8) == 0xF0)
                {
                    if ((ch & 0xE0) == 0xC0)
                        utf_len = 2;
                    else if ((ch & 0xF0) == 0xE0)
                        utf_len = 3;
                    else
                        utf_len = 4;
                    utf[0] = (char) ch;
                    utf_idx = 1;
                }

                // leave reserved alone
                else if (reserved != null && reserved.get(ch))
                {
                    // NOTE(review): on the successful parse path sidx has
                    // already been advanced by 2, so buf[sidx] here looks
                    // like the last hex digit rather than the '%' - confirm
                    // whether the rewind should come before the copy.
                    res[didx++] = buf[sidx];
                    sidx -= 2;
                }

                // just use the decoded version
                else
                    res[didx++] = (char) ch;
            }
            else if (utf_len > 0)       // oops, we misinterpreted
            {
                didx = copyBuf(utf, utf_idx, buf[sidx], res, didx, reserved, false);
                utf_len = -1;
            }
            else
                res[didx++] = buf[sidx];
        }
        if (utf_len > 0)        // oops, we misinterpreted
            didx = copyBuf(utf, utf_idx, -1, res, didx, reserved, false);

        return new String(res, 0, didx);
    }

    /**
     * Helper for unescape(): flushes the pending characters collected in
     * <var>utf</var> into <var>res</var>. A non-negative <var>ch</var> is
     * first added to <var>utf</var>.
     */
    private static final int copyBuf(char[] utf, int utf_idx, int ch,
                                     char[] res, int didx, BitSet reserved,
                                     boolean escapeAll)
    {
        if (ch >= 0)
            utf[utf_idx++] = (char) ch;

        // NOTE(review): a large stretch of source text is missing here. The
        // remainder of copyBuf, the definition of unescapeNoPE() (called
        // throughout this class), the declaration of nl, and the beginning of
        // the self-test code - presumably main() and the head of a parser
        // test helper; verify against upstream - were lost. The surviving
        // tokens, which end with the tail of a "test failed" message, are
        // kept verbatim. Restore from upstream.
        for (int idx=0; idx" + nl +
                                "  rel-URI  = <" + relURI + ">" + nl+
                                "  expected   <" + result + ">" + nl+
                                "  but got    <" + new URI(base, relURI) + ">");
        }
    }

    /**
     * Self-test helper: checks that the two URI strings parse to URIs that
     * are equal and have the same hash code.
     */
    private static void testEqual(String one, String two) throws Exception
    {
        URI u1 = new URI(one);
        URI u2 = new URI(two);

        if (!u1.equals(u2))
        {
            throw new Exception("Test failed: " + nl +
                                "  <" + one + "> != <" + two + ">");
        }
        if (u1.hashCode() != u2.hashCode())
        {
            throw new Exception("Test failed: " + nl +
                                "  hashCode <" + one + "> != hashCode <" + two + ">");
        }
    }

    /**
     * Self-test helper: checks that the two URI strings parse to URIs that
     * are not equal.
     */
    private static void testNotEqual(String one, String two) throws Exception
    {
        URI u1 = new URI(one);
        URI u2 = new URI(two);

        if (u1.equals(u2))
        {
            throw new Exception("Test failed: " + nl +
                                "  <" + one + "> == <" + two + ">");
        }
    }

    /**
     * Self-test helper: checks that parsing <var>uri</var> relative to
     * <var>base</var> throws a ParseException.
     */
    private static void testPE(URI base, String uri) throws Exception
    {
        boolean got_pe = false;
        try
        {
            new URI(base, uri);
        }
        catch (ParseException pe)
        {
            got_pe = true;
        }
        if (!got_pe)
        {
            throw new Exception("Test failed: " + nl +
                                "  <" + uri + "> should be invalid");
        }
    }

    /**
     * Self-test helper: checks that escaping <var>raw</var> against the
     * uric character class (with UTF-8 encoding) yields <var>escaped</var>.
     */
    private static void testEscape(String raw, String escaped) throws Exception
    {
        String test = new String(escape(raw.toCharArray(), uricChar, true));
        if (!test.equals(escaped))
            throw new Exception("Test failed: " + nl +
                                "  raw-string: " + raw + nl +
                                "  escaped:    " + test + nl +
                                "  expected:   " + escaped);
    }

    /**
     * Self-test helper: checks that unescaping <var>escaped</var> with no
     * reserved characters yields <var>raw</var>.
     */
    private static void testUnescape(String escaped, String raw) throws Exception
    {
        if (!unescape(escaped, null).equals(raw))
            throw new Exception("Test failed: " + nl +
                                "  escaped-string: " + escaped + nl +
                                "  unescaped:      " + unescape(escaped, null) + nl +
                                "  expected:       " + raw);
    }
}