1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is the Feed Content Sniffer.
16 : *
17 : * The Initial Developer of the Original Code is Google Inc.
18 : * Portions created by the Initial Developer are Copyright (C) 2006
19 : * the Initial Developer. All Rights Reserved.
20 : *
21 : * Contributor(s):
22 : * Ben Goodger <beng@google.com>
23 : * Robert Sayre <sayrer@gmail.com>
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either the GNU General Public License Version 2 or later (the "GPL"), or
27 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 :
39 : #include "nsFeedSniffer.h"
40 :
41 : #include "prmem.h"
42 :
43 : #include "nsNetCID.h"
44 : #include "nsXPCOM.h"
45 : #include "nsCOMPtr.h"
46 : #include "nsStringStream.h"
47 :
48 : #include "nsBrowserCompsCID.h"
49 :
50 : #include "nsICategoryManager.h"
51 : #include "nsIServiceManager.h"
52 : #include "nsComponentManagerUtils.h"
53 : #include "nsServiceManagerUtils.h"
54 :
55 : #include "nsIStreamConverterService.h"
56 : #include "nsIStreamConverter.h"
57 :
58 : #include "nsIStreamListener.h"
59 :
60 : #include "nsIHttpChannel.h"
61 : #include "nsIMIMEHeaderParam.h"
62 :
63 : #include "nsMimeTypes.h"
64 :
65 : #define TYPE_ATOM "application/atom+xml"
66 : #define TYPE_RSS "application/rss+xml"
67 : #define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"
68 :
69 : #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
70 : #define NS_RSS "http://purl.org/rss/1.0/"
71 :
72 : #define MAX_BYTES 512
73 :
74 12 : NS_IMPL_ISUPPORTS3(nsFeedSniffer,
75 : nsIContentSniffer,
76 : nsIStreamListener,
77 : nsIRequestObserver)
78 :
79 : nsresult
80 1 : nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
81 : const PRUint8* data,
82 : PRUint32 length)
83 : {
84 1 : nsresult rv = NS_OK;
85 :
86 1 : mDecodedData = "";
87 2 : nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
88 1 : if (!httpChannel)
89 0 : return NS_ERROR_NO_INTERFACE;
90 :
91 2 : nsCAutoString contentEncoding;
92 2 : httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
93 1 : contentEncoding);
94 1 : if (!contentEncoding.IsEmpty()) {
95 0 : nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
96 0 : if (converterService) {
97 0 : ToLowerCase(contentEncoding);
98 :
99 0 : nsCOMPtr<nsIStreamListener> converter;
100 0 : rv = converterService->AsyncConvertData(contentEncoding.get(),
101 : "uncompressed", this, nsnull,
102 0 : getter_AddRefs(converter));
103 0 : NS_ENSURE_SUCCESS(rv, rv);
104 :
105 0 : converter->OnStartRequest(request, nsnull);
106 :
107 : nsCOMPtr<nsIStringInputStream> rawStream =
108 0 : do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
109 0 : if (!rawStream)
110 0 : return NS_ERROR_FAILURE;
111 :
112 0 : rv = rawStream->SetData((const char*)data, length);
113 0 : NS_ENSURE_SUCCESS(rv, rv);
114 :
115 0 : rv = converter->OnDataAvailable(request, nsnull, rawStream, 0, length);
116 0 : NS_ENSURE_SUCCESS(rv, rv);
117 :
118 0 : converter->OnStopRequest(request, nsnull, NS_OK);
119 : }
120 : }
121 1 : return rv;
122 : }
123 :
124 : template<int N>
125 : static bool
126 : StringBeginsWithLowercaseLiteral(nsAString& aString,
127 : const char (&aSubstring)[N])
128 : {
129 : return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
130 : }
131 :
132 : bool
133 0 : HasAttachmentDisposition(nsIHttpChannel* httpChannel)
134 : {
135 0 : if (!httpChannel)
136 0 : return false;
137 :
138 : PRUint32 disp;
139 0 : nsresult rv = httpChannel->GetContentDisposition(&disp);
140 :
141 0 : if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
142 0 : return true;
143 :
144 0 : return false;
145 : }
146 :
147 : /**
148 : * @return the first occurrence of a character within a string buffer,
149 : * or nsnull if not found
150 : */
151 : static const char*
152 0 : FindChar(char c, const char *begin, const char *end)
153 : {
154 0 : for (; begin < end; ++begin) {
155 0 : if (*begin == c)
156 0 : return begin;
157 : }
158 0 : return nsnull;
159 : }
160 :
161 : /**
162 : *
163 : * Determine if a substring is the "documentElement" in the document.
164 : *
165 : * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
166 : * element within the XML DOM, i.e. the root container element. Otherwise,
167 : * it's possible that someone embedded one of these tags inside a document of
168 : * another type, e.g. a HTML document, and we don't want to show the preview
169 : * page if the document isn't actually a feed.
170 : *
171 : * @param start
172 : * The beginning of the data being sniffed
173 : * @param end
174 : * The end of the data being sniffed, right before the substring that
175 : * was found.
176 : * @returns true if the found substring is the documentElement, false
177 : * otherwise.
178 : */
179 : static bool
180 0 : IsDocumentElement(const char *start, const char* end)
181 : {
182 : // For every tag in the buffer, check to see if it's a PI, Doctype or
183 : // comment, our desired substring or something invalid.
184 0 : while ( (start = FindChar('<', start, end)) ) {
185 0 : ++start;
186 0 : if (start >= end)
187 0 : return false;
188 :
189 : // Check to see if the character following the '<' is either '?' or '!'
190 : // (processing instruction or doctype or comment)... these are valid nodes
191 : // to have in the prologue.
192 0 : if (*start != '?' && *start != '!')
193 0 : return false;
194 :
195 : // Now advance the iterator until the '>' (We do this because we don't want
196 : // to sniff indicator substrings that are embedded within other nodes, e.g.
197 : // comments: <!-- <rdf:RDF .. > -->
198 0 : start = FindChar('>', start, end);
199 0 : if (!start)
200 0 : return false;
201 :
202 0 : ++start;
203 : }
204 0 : return true;
205 : }
206 :
207 : /**
208 : * Determines whether or not a string exists as the root element in an XML data
209 : * string buffer.
210 : * @param dataString
211 : * The data being sniffed
212 : * @param substring
213 : * The substring being tested for existence and root-ness.
214 : * @returns true if the substring exists and is the documentElement, false
215 : * otherwise.
216 : */
217 : static bool
218 3 : ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
219 : {
220 3 : PRInt32 offset = dataString.Find(substring);
221 3 : if (offset == -1)
222 3 : return false;
223 :
224 0 : const char *begin = dataString.BeginReading();
225 :
226 : // Only do the validation when we find the substring.
227 0 : return IsDocumentElement(begin, begin + offset);
228 : }
229 :
230 : NS_IMETHODIMP
231 4 : nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
232 : const PRUint8* data,
233 : PRUint32 length,
234 : nsACString& sniffedType)
235 : {
236 8 : nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
237 4 : if (!channel)
238 3 : return NS_ERROR_NO_INTERFACE;
239 :
240 : // Check that this is a GET request, since you can't subscribe to a POST...
241 2 : nsCAutoString method;
242 1 : channel->GetRequestMethod(method);
243 1 : if (!method.Equals("GET")) {
244 0 : sniffedType.Truncate();
245 0 : return NS_OK;
246 : }
247 :
248 : // We need to find out if this is a load of a view-source document. In this
249 : // case we do not want to override the content type, since the source display
250 : // does not need to be converted from feed format to XUL. More importantly,
251 : // we don't want to change the content type from something
252 : // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
253 : // etc) to something that only the application fe knows about (maybe.feed)
254 : // thus deactivating syntax highlighting.
255 2 : nsCOMPtr<nsIURI> originalURI;
256 1 : channel->GetOriginalURI(getter_AddRefs(originalURI));
257 :
258 2 : nsCAutoString scheme;
259 1 : originalURI->GetScheme(scheme);
260 1 : if (scheme.EqualsLiteral("view-source")) {
261 0 : sniffedType.Truncate();
262 0 : return NS_OK;
263 : }
264 :
265 : // Check the Content-Type to see if it is set correctly. If it is set to
266 : // something specific that we think is a reliable indication of a feed, don't
267 : // bother sniffing since we assume the site maintainer knows what they're
268 : // doing.
269 2 : nsCAutoString contentType;
270 1 : channel->GetContentType(contentType);
271 1 : bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
272 1 : contentType.EqualsLiteral(TYPE_ATOM);
273 :
274 : // Check to see if this was a feed request from the location bar or from
275 : // the feed: protocol. This is also a reliable indication.
276 : // The value of the header doesn't matter.
277 1 : if (!noSniff) {
278 2 : nsCAutoString sniffHeader;
279 : nsresult foundHeader =
280 2 : channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
281 1 : sniffHeader);
282 1 : noSniff = NS_SUCCEEDED(foundHeader);
283 : }
284 :
285 1 : if (noSniff) {
286 : // check for an attachment after we have a likely feed.
287 0 : if(HasAttachmentDisposition(channel)) {
288 0 : sniffedType.Truncate();
289 0 : return NS_OK;
290 : }
291 :
292 : // set the feed header as a response header, since we have good metadata
293 : // telling us that the feed is supposed to be RSS or Atom
294 0 : channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
295 0 : NS_LITERAL_CSTRING("1"), false);
296 0 : sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
297 0 : return NS_OK;
298 : }
299 :
300 : // Don't sniff arbitrary types. Limit sniffing to situations that
301 : // we think can reasonably arise.
302 1 : if (!contentType.EqualsLiteral(TEXT_HTML) &&
303 0 : !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
304 : // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
305 : // and check for text/xml and application/xml by hand instead?
306 0 : contentType.Find("xml") == -1) {
307 0 : sniffedType.Truncate();
308 0 : return NS_OK;
309 : }
310 :
311 : // Now we need to potentially decompress data served with
312 : // Content-Encoding: gzip
313 1 : nsresult rv = ConvertEncodedData(request, data, length);
314 1 : if (NS_FAILED(rv))
315 0 : return rv;
316 :
317 : const char* testData =
318 1 : mDecodedData.IsEmpty() ? (const char*)data : mDecodedData.get();
319 :
320 : // The strategy here is based on that described in:
321 : // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
322 : // for interoperarbility purposes.
323 :
324 : // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
325 : // false positives by accidentally reading document content, e.g. a "how to
326 : // make a feed" page.
327 1 : if (length > MAX_BYTES)
328 0 : length = MAX_BYTES;
329 :
330 : // Thus begins the actual sniffing.
331 2 : nsDependentCSubstring dataString((const char*)testData, length);
332 :
333 1 : bool isFeed = false;
334 :
335 : // RSS 0.91/0.92/2.0
336 1 : isFeed = ContainsTopLevelSubstring(dataString, "<rss");
337 :
338 : // Atom 1.0
339 1 : if (!isFeed)
340 1 : isFeed = ContainsTopLevelSubstring(dataString, "<feed");
341 :
342 : // RSS 1.0
343 1 : if (!isFeed) {
344 1 : isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
345 0 : dataString.Find(NS_RDF) != -1 &&
346 1 : dataString.Find(NS_RSS) != -1;
347 : }
348 :
349 : // If we sniffed a feed, coerce our internal type
350 1 : if (isFeed && !HasAttachmentDisposition(channel))
351 0 : sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
352 : else
353 1 : sniffedType.Truncate();
354 1 : return NS_OK;
355 : }
356 :
357 : NS_IMETHODIMP
358 0 : nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
359 : {
360 0 : return NS_OK;
361 : }
362 :
363 : NS_METHOD
364 0 : nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
365 : void* closure,
366 : const char* rawSegment,
367 : PRUint32 toOffset,
368 : PRUint32 count,
369 : PRUint32* writeCount)
370 : {
371 0 : nsCString* decodedData = static_cast<nsCString*>(closure);
372 0 : decodedData->Append(rawSegment, count);
373 0 : *writeCount = count;
374 0 : return NS_OK;
375 : }
376 :
377 : NS_IMETHODIMP
378 0 : nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
379 : nsIInputStream* stream, PRUint32 offset,
380 : PRUint32 count)
381 : {
382 : PRUint32 read;
383 : return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
384 0 : &read);
385 : }
386 :
387 : NS_IMETHODIMP
388 0 : nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
389 : nsresult status)
390 : {
391 0 : return NS_OK;
392 : }
|