1 : //* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Effective-TLD Service
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Google Inc.
19 : * Portions created by the Initial Developer are Copyright (C) 2006
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Pamela Greene <pamg.bugs@gmail.com> (original author)
24 : * Daniel Witte <dwitte@stanford.edu>
25 : * Jeff Walden <jwalden+code@mit.edu>
26 : *
27 : * Alternatively, the contents of this file may be used under the terms of
28 : * either the GNU General Public License Version 2 or later (the "GPL"), or
29 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 : * in which case the provisions of the GPL or the LGPL are applicable instead
31 : * of those above. If you wish to allow use of your version of this file only
32 : * under the terms of either the GPL or the LGPL, and not to allow others to
33 : * use your version of this file under the terms of the MPL, indicate your
34 : * decision by deleting the provisions above and replace them with the notice
35 : * and other provisions required by the GPL or the LGPL. If you do not delete
36 : * the provisions above, a recipient may use your version of this file under
37 : * the terms of any one of the MPL, the GPL or the LGPL.
38 : *
39 : * ***** END LICENSE BLOCK ***** */
40 :
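// This service answers effective-TLD (public suffix) queries from a set of
// rules describing TLD-like domain names. In this file the rules are compiled
// in from etld_data.inc rather than read at runtime. For a complete
// description of the expected rule file format and parsing rules, see
// http://wiki.mozilla.org/Gecko:Effective_TLD_Service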
44 :
45 : #include "mozilla/Util.h"
46 :
47 : #include "nsEffectiveTLDService.h"
48 : #include "nsIIDNService.h"
49 : #include "nsNetUtil.h"
50 : #include "prnetdb.h"
51 :
52 : #include "mozilla/FunctionTimer.h"
53 :
54 : using namespace mozilla;
55 :
// Declares nsIEffectiveTLDService as the single interface implemented by
// nsEffectiveTLDService.
// NOTE(review): this is presumably the standard XPCOM macro that supplies the
// AddRef/Release/QueryInterface boilerplate -- confirm in nsISupportsImpl.h.
NS_IMPL_ISUPPORTS1(nsEffectiveTLDService, nsIEffectiveTLDService)
57 :
58 : // ----------------------------------------------------------------------
59 :
// Compiled-in table of effective-TLD rules. The initializer list is pulled in
// verbatim from etld_data.inc. Init() inserts every element except the last
// one into mHash, keyed by the element's `domain` member.
// NOTE(review): the skipped final element is presumably a terminator emitted
// by the generator of etld_data.inc -- confirm there.
static const ETLDEntry gEntries[] =
#include "etld_data.inc"
;
63 :
64 : // ----------------------------------------------------------------------
65 :
66 : nsresult
67 302 : nsEffectiveTLDService::Init()
68 : {
69 : NS_TIME_FUNCTION;
70 :
71 : // We'll probably have to rehash at least once, since nsTHashtable doesn't
72 : // use a perfect hash, but at least we'll save a few rehashes along the way.
73 : // Next optimization here is to precompute the hash using something like
74 : // gperf, but one step at a time. :-)
75 302 : if (!mHash.Init(ArrayLength(gEntries) - 1))
76 0 : return NS_ERROR_OUT_OF_MEMORY;
77 :
78 : nsresult rv;
79 302 : mIDNService = do_GetService(NS_IDNSERVICE_CONTRACTID, &rv);
80 302 : if (NS_FAILED(rv)) return rv;
81 :
82 : // Initialize eTLD hash from static array
83 1224006 : for (PRUint32 i = 0; i < ArrayLength(gEntries) - 1; i++) {
84 : #ifdef DEBUG
85 2447408 : nsDependentCString name(gEntries[i].domain);
86 2447408 : nsCAutoString normalizedName(gEntries[i].domain);
87 1223704 : NS_ASSERTION(NS_SUCCEEDED(NormalizeHostname(normalizedName)),
88 : "normalization failure!");
89 1223704 : NS_ASSERTION(name.Equals(normalizedName), "domain not normalized!");
90 : #endif
91 1223704 : nsDomainEntry *entry = mHash.PutEntry(gEntries[i].domain);
92 1223704 : NS_ENSURE_TRUE(entry, NS_ERROR_OUT_OF_MEMORY);
93 2447408 : entry->SetData(&gEntries[i]);
94 : }
95 302 : return NS_OK;
96 : }
97 :
98 : // External function for dealing with URI's correctly.
99 : // Pulls out the host portion from an nsIURI, and calls through to
100 : // GetPublicSuffixFromHost().
101 : NS_IMETHODIMP
102 1 : nsEffectiveTLDService::GetPublicSuffix(nsIURI *aURI,
103 : nsACString &aPublicSuffix)
104 : {
105 1 : NS_ENSURE_ARG_POINTER(aURI);
106 :
107 2 : nsCOMPtr<nsIURI> innerURI = NS_GetInnermostURI(aURI);
108 1 : NS_ENSURE_ARG_POINTER(innerURI);
109 :
110 2 : nsCAutoString host;
111 1 : nsresult rv = innerURI->GetAsciiHost(host);
112 1 : if (NS_FAILED(rv)) return rv;
113 :
114 1 : return GetBaseDomainInternal(host, 0, aPublicSuffix);
115 : }
116 :
117 : // External function for dealing with URI's correctly.
118 : // Pulls out the host portion from an nsIURI, and calls through to
119 : // GetBaseDomainFromHost().
120 : NS_IMETHODIMP
121 18944 : nsEffectiveTLDService::GetBaseDomain(nsIURI *aURI,
122 : PRUint32 aAdditionalParts,
123 : nsACString &aBaseDomain)
124 : {
125 18944 : NS_ENSURE_ARG_POINTER(aURI);
126 :
127 37888 : nsCOMPtr<nsIURI> innerURI = NS_GetInnermostURI(aURI);
128 18944 : NS_ENSURE_ARG_POINTER(innerURI);
129 :
130 37888 : nsCAutoString host;
131 18944 : nsresult rv = innerURI->GetAsciiHost(host);
132 18944 : if (NS_FAILED(rv)) return rv;
133 :
134 18943 : return GetBaseDomainInternal(host, aAdditionalParts + 1, aBaseDomain);
135 : }
136 :
137 : // External function for dealing with a host string directly: finds the public
138 : // suffix (e.g. co.uk) for the given hostname. See GetBaseDomainInternal().
139 : NS_IMETHODIMP
140 4330 : nsEffectiveTLDService::GetPublicSuffixFromHost(const nsACString &aHostname,
141 : nsACString &aPublicSuffix)
142 : {
143 : // Create a mutable copy of the hostname and normalize it to ACE.
144 : // This will fail if the hostname includes invalid characters.
145 8660 : nsCAutoString normHostname(aHostname);
146 4330 : nsresult rv = NormalizeHostname(normHostname);
147 4330 : if (NS_FAILED(rv)) return rv;
148 :
149 4330 : return GetBaseDomainInternal(normHostname, 0, aPublicSuffix);
150 : }
151 :
152 : // External function for dealing with a host string directly: finds the base
153 : // domain (e.g. www.co.uk) for the given hostname and number of subdomain parts
154 : // requested. See GetBaseDomainInternal().
155 : NS_IMETHODIMP
156 7620 : nsEffectiveTLDService::GetBaseDomainFromHost(const nsACString &aHostname,
157 : PRUint32 aAdditionalParts,
158 : nsACString &aBaseDomain)
159 : {
160 : // Create a mutable copy of the hostname and normalize it to ACE.
161 : // This will fail if the hostname includes invalid characters.
162 15240 : nsCAutoString normHostname(aHostname);
163 7620 : nsresult rv = NormalizeHostname(normHostname);
164 7620 : if (NS_FAILED(rv)) return rv;
165 :
166 7620 : return GetBaseDomainInternal(normHostname, aAdditionalParts + 1, aBaseDomain);
167 : }
168 :
169 : // Finds the base domain for a host, with requested number of additional parts.
170 : // This will fail, generating an error, if the host is an IPv4/IPv6 address,
171 : // if more subdomain parts are requested than are available, or if the hostname
172 : // includes characters that are not valid in a URL. Normalization is performed
173 : // on the host string and the result will be in UTF8.
174 : nsresult
175 30894 : nsEffectiveTLDService::GetBaseDomainInternal(nsCString &aHostname,
176 : PRUint32 aAdditionalParts,
177 : nsACString &aBaseDomain)
178 : {
179 30894 : if (aHostname.IsEmpty())
180 39 : return NS_ERROR_INSUFFICIENT_DOMAIN_LEVELS;
181 :
182 : // chomp any trailing dot, and keep track of it for later
183 30855 : bool trailingDot = aHostname.Last() == '.';
184 30855 : if (trailingDot)
185 43 : aHostname.Truncate(aHostname.Length() - 1);
186 :
187 : // check the edge cases of the host being '.' or having a second trailing '.',
188 : // since subsequent checks won't catch it.
189 30855 : if (aHostname.IsEmpty() || aHostname.Last() == '.')
190 9 : return NS_ERROR_INVALID_ARG;
191 :
192 : // Check if we're dealing with an IPv4/IPv6 hostname, and return
193 : PRNetAddr addr;
194 30846 : PRStatus result = PR_StringToNetAddr(aHostname.get(), &addr);
195 30846 : if (result == PR_SUCCESS)
196 106 : return NS_ERROR_HOST_IS_IP_ADDRESS;
197 :
198 : // Walk up the domain tree, most specific to least specific,
199 : // looking for matches at each level. Note that a given level may
200 : // have multiple attributes (e.g. IsWild() and IsNormal()).
201 30740 : const char *prevDomain = nsnull;
202 30740 : const char *currDomain = aHostname.get();
203 30740 : const char *nextDot = strchr(currDomain, '.');
204 30740 : const char *end = currDomain + aHostname.Length();
205 30740 : const char *eTLD = currDomain;
206 28487 : while (1) {
207 : // sanity check the string we're about to look up: it should not begin with
208 : // a '.'; this would mean the hostname began with a '.' or had an
209 : // embedded '..' sequence.
210 59227 : if (*currDomain == '.')
211 5 : return NS_ERROR_INVALID_ARG;
212 :
213 : // perform the hash lookup.
214 59222 : nsDomainEntry *entry = mHash.GetEntry(currDomain);
215 59222 : if (entry) {
216 4073 : if (entry->IsWild() && prevDomain) {
217 : // wildcard rules imply an eTLD one level inferior to the match.
218 110 : eTLD = prevDomain;
219 110 : break;
220 :
221 3963 : } else if (entry->IsNormal() || !nextDot) {
222 : // specific match, or we've hit the top domain level
223 3866 : eTLD = currDomain;
224 3866 : break;
225 :
226 97 : } else if (entry->IsException()) {
227 : // exception rules imply an eTLD one level superior to the match.
228 97 : eTLD = nextDot + 1;
229 97 : break;
230 : }
231 : }
232 :
233 55149 : if (!nextDot) {
234 : // we've hit the top domain level; use it by default.
235 26662 : eTLD = currDomain;
236 26662 : break;
237 : }
238 :
239 28487 : prevDomain = currDomain;
240 28487 : currDomain = nextDot + 1;
241 28487 : nextDot = strchr(currDomain, '.');
242 : }
243 :
244 : // count off the number of requested domains.
245 30735 : const char *begin = aHostname.get();
246 30735 : const char *iter = eTLD;
247 118803 : while (1) {
248 149538 : if (iter == begin)
249 25516 : break;
250 :
251 124022 : if (*(--iter) == '.' && aAdditionalParts-- == 0) {
252 5219 : ++iter;
253 5219 : ++aAdditionalParts;
254 5219 : break;
255 : }
256 : }
257 :
258 30735 : if (aAdditionalParts != 0)
259 3179 : return NS_ERROR_INSUFFICIENT_DOMAIN_LEVELS;
260 :
261 27556 : aBaseDomain = Substring(iter, end);
262 : // add on the trailing dot, if applicable
263 27556 : if (trailingDot)
264 20 : aBaseDomain.Append('.');
265 :
266 27556 : return NS_OK;
267 : }
268 :
269 : // Normalizes the given hostname, component by component. ASCII/ACE
270 : // components are lower-cased, and UTF-8 components are normalized per
271 : // RFC 3454 and converted to ACE.
272 : nsresult
273 1235654 : nsEffectiveTLDService::NormalizeHostname(nsCString &aHostname)
274 : {
275 1235654 : if (!IsASCII(aHostname)) {
276 246 : nsresult rv = mIDNService->ConvertUTF8toACE(aHostname, aHostname);
277 246 : if (NS_FAILED(rv))
278 0 : return rv;
279 : }
280 :
281 1235654 : ToLowerCase(aHostname);
282 1235654 : return NS_OK;
283 : }
|