1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is inline spellchecker code.
16 : *
17 : * The Initial Developer of the Original Code is Google Inc.
18 : * Portions created by the Initial Developer are Copyright (C) 2004-2006
19 : * the Initial Developer. All Rights Reserved.
20 : *
21 : * Contributor(s):
22 : * Brett Wilson <brettw@gmail.com> (original author)
23 : * Robert O'Callahan <rocallahan@novell.com>
24 : * Ms2ger <ms2ger@gmail.com>
25 : *
26 : * Alternatively, the contents of this file may be used under the terms of
27 : * either the GNU General Public License Version 2 or later (the "GPL"), or
28 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 : * in which case the provisions of the GPL or the LGPL are applicable instead
30 : * of those above. If you wish to allow use of your version of this file only
31 : * under the terms of either the GPL or the LGPL, and not to allow others to
32 : * use your version of this file under the terms of the MPL, indicate your
33 : * decision by deleting the provisions above and replace them with the notice
34 : * and other provisions required by the GPL or the LGPL. If you do not delete
35 : * the provisions above, a recipient may use your version of this file under
36 : * the terms of any one of the MPL, the GPL or the LGPL.
37 : *
38 : * ***** END LICENSE BLOCK ***** */
39 :
40 : #include "mozInlineSpellWordUtil.h"
41 : #include "nsDebug.h"
42 : #include "nsIAtom.h"
43 : #include "nsComponentManagerUtils.h"
44 : #include "nsIDOMCSSStyleDeclaration.h"
45 : #include "nsIDOMElement.h"
46 : #include "nsIDOMRange.h"
47 : #include "nsIEditor.h"
48 : #include "nsIDOMNode.h"
49 : #include "nsIDOMHTMLBRElement.h"
50 : #include "nsUnicharUtilCIID.h"
51 : #include "nsUnicodeProperties.h"
52 : #include "nsServiceManagerUtils.h"
53 : #include "nsIContent.h"
54 : #include "nsTextFragment.h"
55 : #include "mozilla/dom/Element.h"
56 : #include "nsIFrame.h"
57 : #include "nsRange.h"
58 : #include "nsContentUtils.h"
59 :
60 : using namespace mozilla;
61 :
62 : // IsIgnorableCharacter
63 : //
64 : // These characters are ones that we should ignore in input.
65 :
66 0 : inline bool IsIgnorableCharacter(PRUnichar ch)
67 : {
68 : return (ch == 0x200D || // ZERO-WIDTH JOINER
69 : ch == 0xAD || // SOFT HYPHEN
70 0 : ch == 0x1806); // MONGOLIAN TODO SOFT HYPHEN
71 : }
72 :
73 : // IsConditionalPunctuation
74 : //
75 : // Some characters (like apostrophes) require characters on each side to be
76 : // part of a word, and are otherwise punctuation.
77 :
78 0 : inline bool IsConditionalPunctuation(PRUnichar ch)
79 : {
80 : return (ch == '\'' ||
81 0 : ch == 0x2019); // RIGHT SINGLE QUOTATION MARK
82 : }
83 :
84 : // mozInlineSpellWordUtil::Init
85 :
86 : nsresult
87 0 : mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
88 : {
89 : nsresult rv;
90 :
91 : // getting the editor can fail commonly because the editor was detached, so
92 : // don't assert
93 0 : nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
94 0 : if (NS_FAILED(rv))
95 0 : return rv;
96 :
97 0 : nsCOMPtr<nsIDOMDocument> domDoc;
98 0 : rv = editor->GetDocument(getter_AddRefs(domDoc));
99 0 : NS_ENSURE_SUCCESS(rv, rv);
100 0 : NS_ENSURE_TRUE(domDoc, NS_ERROR_NULL_POINTER);
101 :
102 0 : mDOMDocument = domDoc;
103 0 : mDocument = do_QueryInterface(domDoc);
104 :
105 : // Find the root node for the editor. For contenteditable we'll need something
106 : // cleverer here.
107 0 : nsCOMPtr<nsIDOMElement> rootElt;
108 0 : rv = editor->GetRootElement(getter_AddRefs(rootElt));
109 0 : NS_ENSURE_SUCCESS(rv, rv);
110 :
111 0 : nsCOMPtr<nsINode> rootNode = do_QueryInterface(rootElt);
112 0 : mRootNode = rootNode;
113 0 : NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
114 0 : return NS_OK;
115 : }
116 :
117 : static inline bool
118 0 : IsTextNode(nsINode* aNode)
119 : {
120 0 : return aNode->IsNodeOfType(nsINode::eTEXT);
121 : }
122 :
123 : typedef void (* OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
124 :
125 : // Find the next node in the DOM tree in preorder.
126 : // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
127 : // why we can't just use GetNextNode here, sadly.
128 : static nsINode*
129 0 : FindNextNode(nsINode* aNode, nsINode* aRoot,
130 : OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure)
131 : {
132 0 : NS_PRECONDITION(aNode, "Null starting node?");
133 :
134 0 : nsINode* next = aNode->GetFirstChild();
135 0 : if (next)
136 0 : return next;
137 :
138 : // Don't look at siblings or otherwise outside of aRoot
139 0 : if (aNode == aRoot)
140 0 : return nsnull;
141 :
142 0 : next = aNode->GetNextSibling();
143 0 : if (next)
144 0 : return next;
145 :
146 : // Go up
147 0 : for (;;) {
148 0 : if (aOnLeaveNode) {
149 0 : aOnLeaveNode(aNode, aClosure);
150 : }
151 :
152 0 : next = aNode->GetParent();
153 0 : if (next == aRoot || ! next)
154 0 : return nsnull;
155 0 : aNode = next;
156 :
157 0 : next = aNode->GetNextSibling();
158 0 : if (next)
159 0 : return next;
160 : }
161 : }
162 :
163 : // aNode is not a text node. Find the first text node starting at aNode/aOffset
164 : // in a preorder DOM traversal.
165 : static nsINode*
166 0 : FindNextTextNode(nsINode* aNode, PRInt32 aOffset, nsINode* aRoot)
167 : {
168 0 : NS_PRECONDITION(aNode, "Null starting node?");
169 0 : NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");
170 :
171 : nsINode* checkNode;
172 : // Need to start at the aOffset'th child
173 0 : nsIContent* child = aNode->GetChildAt(aOffset);
174 :
175 0 : if (child) {
176 0 : checkNode = child;
177 : } else {
178 : // aOffset was beyond the end of the child list.
179 : // goto next node after the last descendant of aNode in
180 : // a preorder DOM traversal.
181 0 : checkNode = aNode->GetNextNonChildNode(aRoot);
182 : }
183 :
184 0 : while (checkNode && !IsTextNode(checkNode)) {
185 0 : checkNode = checkNode->GetNextNode(aRoot);
186 : }
187 0 : return checkNode;
188 : }
189 :
190 : // mozInlineSpellWordUtil::SetEnd
191 : //
192 : // We have two ranges "hard" and "soft". The hard boundary is simply
193 : // the scope of the root node. The soft boundary is that which is set
194 : // by the caller of this class by calling this function. If this function is
195 : // not called, the soft boundary is the same as the hard boundary.
196 : //
197 : // When we reach the soft boundary (mSoftEnd), we keep
198 : // going until we reach the end of a word. This allows the caller to set the
199 : // end of the range to anything, and we will always check whole multiples of
200 : // words. When we reach the hard boundary we stop no matter what.
201 : //
202 : // There is no beginning soft boundary. This is because we only go to the
203 : // previous node once, when finding the previous word boundary in
204 : // SetPosition(). You might think of the soft boundary as being this initial
205 : // position.
206 :
207 : nsresult
208 0 : mozInlineSpellWordUtil::SetEnd(nsINode* aEndNode, PRInt32 aEndOffset)
209 : {
210 0 : NS_PRECONDITION(aEndNode, "Null end node?");
211 :
212 0 : NS_ASSERTION(mRootNode, "Not initialized");
213 :
214 0 : InvalidateWords();
215 :
216 0 : if (!IsTextNode(aEndNode)) {
217 : // End at the start of the first text node after aEndNode/aEndOffset.
218 0 : aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
219 0 : aEndOffset = 0;
220 : }
221 0 : mSoftEnd = NodeOffset(aEndNode, aEndOffset);
222 0 : return NS_OK;
223 : }
224 :
225 : nsresult
226 0 : mozInlineSpellWordUtil::SetPosition(nsINode* aNode, PRInt32 aOffset)
227 : {
228 0 : InvalidateWords();
229 :
230 0 : if (!IsTextNode(aNode)) {
231 : // Start at the start of the first text node after aNode/aOffset.
232 0 : aNode = FindNextTextNode(aNode, aOffset, mRootNode);
233 0 : aOffset = 0;
234 : }
235 0 : mSoftBegin = NodeOffset(aNode, aOffset);
236 :
237 0 : EnsureWords();
238 :
239 0 : PRInt32 textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
240 0 : if (textOffset < 0)
241 0 : return NS_OK;
242 0 : mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
243 0 : return NS_OK;
244 : }
245 :
246 : void
247 0 : mozInlineSpellWordUtil::EnsureWords()
248 : {
249 0 : if (mSoftTextValid)
250 0 : return;
251 0 : BuildSoftText();
252 0 : BuildRealWords();
253 0 : mSoftTextValid = true;
254 : }
255 :
256 : nsresult
257 0 : mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsRange** aRange)
258 : {
259 0 : NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
260 0 : NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
261 0 : return MakeRange(begin, end, aRange);
262 : }
263 :
264 : // mozInlineSpellWordUtil::GetRangeForWord
265 :
266 : nsresult
267 0 : mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
268 : PRInt32 aWordOffset,
269 : nsRange** aRange)
270 : {
271 : // Set our soft end and start
272 0 : nsCOMPtr<nsINode> wordNode = do_QueryInterface(aWordNode);
273 0 : NodeOffset pt = NodeOffset(wordNode, aWordOffset);
274 :
275 0 : InvalidateWords();
276 0 : mSoftBegin = mSoftEnd = pt;
277 0 : EnsureWords();
278 :
279 0 : PRInt32 offset = MapDOMPositionToSoftTextOffset(pt);
280 0 : if (offset < 0)
281 0 : return MakeRange(pt, pt, aRange);
282 0 : PRInt32 wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
283 0 : if (wordIndex < 0)
284 0 : return MakeRange(pt, pt, aRange);
285 0 : return MakeRangeForWord(mRealWords[wordIndex], aRange);
286 : }
287 :
288 : // This is to fix characters that the spellchecker may not like
289 : static void
290 0 : NormalizeWord(const nsSubstring& aInput, PRInt32 aPos, PRInt32 aLen, nsAString& aOutput)
291 : {
292 0 : aOutput.Truncate();
293 0 : for (PRInt32 i = 0; i < aLen; i++) {
294 0 : PRUnichar ch = aInput.CharAt(i + aPos);
295 :
296 : // remove ignorable characters from the word
297 0 : if (IsIgnorableCharacter(ch))
298 0 : continue;
299 :
300 : // the spellchecker doesn't handle curly apostrophes in all languages
301 0 : if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
302 0 : ch = '\'';
303 : }
304 :
305 0 : aOutput.Append(ch);
306 : }
307 0 : }
308 :
309 : // mozInlineSpellWordUtil::GetNextWord
310 : //
311 : // FIXME-optimization: we shouldn't have to generate a range every single
312 : // time. It would be better if the inline spellchecker didn't require a
313 : // range unless the word was misspelled. This may or may not be possible.
314 :
315 : nsresult
316 0 : mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsRange** aRange,
317 : bool* aSkipChecking)
318 : {
319 : #ifdef DEBUG_SPELLCHECK
320 : printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
321 : #endif
322 :
323 0 : if (mNextWordIndex < 0 ||
324 0 : mNextWordIndex >= PRInt32(mRealWords.Length())) {
325 0 : mNextWordIndex = -1;
326 0 : *aRange = nsnull;
327 0 : *aSkipChecking = true;
328 0 : return NS_OK;
329 : }
330 :
331 0 : const RealWord& word = mRealWords[mNextWordIndex];
332 0 : nsresult rv = MakeRangeForWord(word, aRange);
333 0 : NS_ENSURE_SUCCESS(rv, rv);
334 0 : ++mNextWordIndex;
335 0 : *aSkipChecking = !word.mCheckableWord;
336 0 : ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
337 :
338 : #ifdef DEBUG_SPELLCHECK
339 : printf("GetNextWord returning: %s (skip=%d)\n",
340 : NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
341 : #endif
342 :
343 0 : return NS_OK;
344 : }
345 :
346 : // mozInlineSpellWordUtil::MakeRange
347 : //
348 : // Convenience function for creating a range over the current document.
349 :
350 : nsresult
351 0 : mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
352 : nsRange** aRange)
353 : {
354 0 : if (!mDOMDocument)
355 0 : return NS_ERROR_NOT_INITIALIZED;
356 :
357 0 : nsRefPtr<nsRange> range = new nsRange();
358 : nsresult rv = range->Set(aBegin.mNode, aBegin.mOffset,
359 0 : aEnd.mNode, aEnd.mOffset);
360 0 : NS_ENSURE_SUCCESS(rv, rv);
361 0 : range.forget(aRange);
362 :
363 0 : return NS_OK;
364 : }
365 :
366 : /*********** DOM text extraction ************/
367 :
368 : // IsDOMWordSeparator
369 : //
370 : // Determines if the given character should be considered as a DOM Word
371 : // separator. Basically, this is whitespace, although it could also have
372 : // certain punctuation that we know ALWAYS breaks words. This is important.
373 : // For example, we can't have any punctuation that could appear in a URL
374 : // or email address in this, because those need to always fit into a single
375 : // DOM word.
376 :
377 : static bool
378 0 : IsDOMWordSeparator(PRUnichar ch)
379 : {
380 : // simple spaces
381 0 : if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
382 0 : return true;
383 :
384 : // complex spaces - check only if char isn't ASCII (uncommon)
385 0 : if (ch >= 0xA0 &&
386 : (ch == 0x00A0 || // NO-BREAK SPACE
387 : ch == 0x2002 || // EN SPACE
388 : ch == 0x2003 || // EM SPACE
389 : ch == 0x2009 || // THIN SPACE
390 : ch == 0x200C || // ZERO WIDTH NON-JOINER
391 : ch == 0x3000)) // IDEOGRAPHIC SPACE
392 0 : return true;
393 :
394 : // otherwise not a space
395 0 : return false;
396 : }
397 :
398 : static inline bool
399 0 : IsBRElement(nsINode* aNode)
400 : {
401 0 : return aNode->IsElement() &&
402 0 : aNode->AsElement()->IsHTML(nsGkAtoms::br);
403 : }
404 :
405 : /**
406 : * Check if there's a DOM word separator before aBeforeOffset in this node.
407 : * Always returns true if it's a BR element.
408 : * aSeparatorOffset is set to the index of the first character in the last
409 : * separator if any is found (0 for BR elements).
410 : *
411 : * This function does not modify aSeparatorOffset when it returns false.
412 : */
413 : static bool
414 0 : ContainsDOMWordSeparator(nsINode* aNode, PRInt32 aBeforeOffset,
415 : PRInt32* aSeparatorOffset)
416 : {
417 0 : if (IsBRElement(aNode)) {
418 0 : *aSeparatorOffset = 0;
419 0 : return true;
420 : }
421 :
422 0 : if (!IsTextNode(aNode))
423 0 : return false;
424 :
425 : // aNode is actually an nsIContent, since it's eTEXT
426 0 : nsIContent* content = static_cast<nsIContent*>(aNode);
427 0 : const nsTextFragment* textFragment = content->GetText();
428 0 : NS_ASSERTION(textFragment, "Where is our text?");
429 0 : for (PRInt32 i = NS_MIN(aBeforeOffset, PRInt32(textFragment->GetLength())) - 1; i >= 0; --i) {
430 0 : if (IsDOMWordSeparator(textFragment->CharAt(i))) {
431 : // Be greedy, find as many separators as we can
432 0 : for (PRInt32 j = i - 1; j >= 0; --j) {
433 0 : if (IsDOMWordSeparator(textFragment->CharAt(j))) {
434 0 : i = j;
435 : } else {
436 0 : break;
437 : }
438 : }
439 0 : *aSeparatorOffset = i;
440 0 : return true;
441 : }
442 : }
443 0 : return false;
444 : }
445 :
446 : static bool
447 0 : IsBreakElement(nsINode* aNode)
448 : {
449 0 : if (!aNode->IsElement()) {
450 0 : return false;
451 : }
452 :
453 0 : dom::Element *element = aNode->AsElement();
454 :
455 0 : if (element->IsHTML(nsGkAtoms::br))
456 0 : return true;
457 :
458 : // If we don't have a frame, we don't consider ourselves a break
459 : // element. In particular, words can span us.
460 0 : if (!element->GetPrimaryFrame())
461 0 : return false;
462 :
463 : // Anything that's not an inline element is a break element.
464 : // XXXbz should replaced inlines be break elements, though?
465 0 : return element->GetPrimaryFrame()->GetStyleDisplay()->mDisplay !=
466 0 : NS_STYLE_DISPLAY_INLINE;
467 : }
468 :
469 : struct CheckLeavingBreakElementClosure {
470 : bool mLeftBreakElement;
471 : };
472 :
473 : static void
474 0 : CheckLeavingBreakElement(nsINode* aNode, void* aClosure)
475 : {
476 : CheckLeavingBreakElementClosure* cl =
477 0 : static_cast<CheckLeavingBreakElementClosure*>(aClosure);
478 0 : if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
479 0 : cl->mLeftBreakElement = true;
480 : }
481 0 : }
482 :
483 : void
484 0 : mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
485 : {
486 0 : nsAutoString result;
487 0 : ::NormalizeWord(aWord, 0, aWord.Length(), result);
488 0 : aWord = result;
489 0 : }
490 :
491 : void
492 0 : mozInlineSpellWordUtil::BuildSoftText()
493 : {
494 : // First we have to work backwards from mSoftStart to find a text node
495 : // containing a DOM word separator, a non-inline-element
496 : // boundary, or the hard start node. That's where we'll start building the
497 : // soft string from.
498 0 : nsINode* node = mSoftBegin.mNode;
499 0 : PRInt32 firstOffsetInNode = 0;
500 0 : PRInt32 checkBeforeOffset = mSoftBegin.mOffset;
501 0 : while (node) {
502 0 : if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
503 0 : if (node == mSoftBegin.mNode) {
504 : // If we find a word separator on the first node, look at the preceding
505 : // word on the text node as well.
506 0 : PRInt32 newOffset = 0;
507 0 : if (firstOffsetInNode > 0) {
508 : // Try to find the previous word boundary. We ignore the return value
509 : // of ContainsDOMWordSeparator here because there might be no preceding
510 : // word separator (such as when we're at the end of the first word in
511 : // the text node), in which case we just set the found offsets to 0.
512 : // Otherwise, ContainsDOMWordSeparator finds us the correct word
513 : // boundary so that we can avoid looking at too many words.
514 0 : ContainsDOMWordSeparator(node, firstOffsetInNode - 1, &newOffset);
515 : }
516 0 : firstOffsetInNode = newOffset;
517 0 : mSoftBegin.mOffset = newOffset;
518 : }
519 0 : break;
520 : }
521 0 : checkBeforeOffset = PR_INT32_MAX;
522 0 : if (IsBreakElement(node)) {
523 : // Since GetPreviousContent follows tree *preorder*, we're about to traverse
524 : // up out of 'node'. Since node induces breaks (e.g., it's a block),
525 : // don't bother trying to look outside it, just stop now.
526 0 : break;
527 : }
528 : // GetPreviousContent below expects mRootNode to be an ancestor of node.
529 0 : if (!nsContentUtils::ContentIsDescendantOf(node, mRootNode)) {
530 0 : break;
531 : }
532 0 : node = node->GetPreviousContent(mRootNode);
533 : }
534 :
535 : // Now build up the string moving forward through the DOM until we reach
536 : // the soft end and *then* see a DOM word separator, a non-inline-element
537 : // boundary, or the hard end node.
538 0 : mSoftText.Truncate();
539 0 : mSoftTextDOMMapping.Clear();
540 0 : bool seenSoftEnd = false;
541 : // Leave this outside the loop so large heap string allocations can be reused
542 : // across iterations
543 0 : while (node) {
544 0 : if (node == mSoftEnd.mNode) {
545 0 : seenSoftEnd = true;
546 : }
547 :
548 0 : bool exit = false;
549 0 : if (IsTextNode(node)) {
550 0 : nsIContent* content = static_cast<nsIContent*>(node);
551 0 : NS_ASSERTION(content, "Where is our content?");
552 0 : const nsTextFragment* textFragment = content->GetText();
553 0 : NS_ASSERTION(textFragment, "Where is our text?");
554 0 : PRInt32 lastOffsetInNode = textFragment->GetLength();
555 :
556 0 : if (seenSoftEnd) {
557 : // check whether we can stop after this
558 0 : for (PRInt32 i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
559 0 : i < PRInt32(textFragment->GetLength()); ++i) {
560 0 : if (IsDOMWordSeparator(textFragment->CharAt(i))) {
561 0 : exit = true;
562 : // stop at the first separator after the soft end point
563 0 : lastOffsetInNode = i;
564 0 : break;
565 : }
566 : }
567 : }
568 :
569 0 : if (firstOffsetInNode < lastOffsetInNode) {
570 0 : PRInt32 len = lastOffsetInNode - firstOffsetInNode;
571 : mSoftTextDOMMapping.AppendElement(
572 0 : DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
573 0 : textFragment->AppendTo(mSoftText, firstOffsetInNode, len);
574 : }
575 :
576 0 : firstOffsetInNode = 0;
577 : }
578 :
579 0 : if (exit)
580 0 : break;
581 :
582 0 : CheckLeavingBreakElementClosure closure = { false };
583 0 : node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
584 0 : if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
585 : // We left, or are entering, a break element (e.g., block). Maybe we can
586 : // stop now.
587 0 : if (seenSoftEnd)
588 0 : break;
589 : // Record the break
590 0 : mSoftText.Append(' ');
591 : }
592 : }
593 :
594 : #ifdef DEBUG_SPELLCHECK
595 : printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
596 : #endif
597 0 : }
598 :
599 : void
600 0 : mozInlineSpellWordUtil::BuildRealWords()
601 : {
602 : // This is pretty simple. We just have to walk mSoftText, tokenizing it
603 : // into "real words".
604 : // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
605 : // SplitDOMWord on each of those DOM words
606 0 : PRInt32 wordStart = -1;
607 0 : mRealWords.Clear();
608 0 : for (PRInt32 i = 0; i < PRInt32(mSoftText.Length()); ++i) {
609 0 : if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
610 0 : if (wordStart >= 0) {
611 0 : SplitDOMWord(wordStart, i);
612 0 : wordStart = -1;
613 : }
614 : } else {
615 0 : if (wordStart < 0) {
616 0 : wordStart = i;
617 : }
618 : }
619 : }
620 0 : if (wordStart >= 0) {
621 0 : SplitDOMWord(wordStart, mSoftText.Length());
622 : }
623 0 : }
624 :
625 : /*********** DOM/realwords<->mSoftText mapping functions ************/
626 :
627 : PRInt32
628 0 : mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
629 : {
630 0 : if (!mSoftTextValid) {
631 0 : NS_ERROR("Soft text must be valid if we're to map into it");
632 0 : return -1;
633 : }
634 :
635 0 : for (PRInt32 i = 0; i < PRInt32(mSoftTextDOMMapping.Length()); ++i) {
636 0 : const DOMTextMapping& map = mSoftTextDOMMapping[i];
637 0 : if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
638 : // Allow offsets at either end of the string, in particular, allow the
639 : // offset that's at the end of the contributed string
640 : PRInt32 offsetInContributedString =
641 0 : aNodeOffset.mOffset - map.mNodeOffset.mOffset;
642 0 : if (offsetInContributedString >= 0 &&
643 : offsetInContributedString <= map.mLength)
644 0 : return map.mSoftTextOffset + offsetInContributedString;
645 0 : return -1;
646 : }
647 : }
648 0 : return -1;
649 : }
650 :
651 : mozInlineSpellWordUtil::NodeOffset
652 0 : mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(PRInt32 aSoftTextOffset,
653 : DOMMapHint aHint)
654 : {
655 0 : NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
656 0 : if (!mSoftTextValid)
657 0 : return NodeOffset(nsnull, -1);
658 :
659 : // The invariant is that the range start..end includes the last mapping,
660 : // if any, such that mSoftTextOffset <= aSoftTextOffset
661 0 : PRInt32 start = 0;
662 0 : PRInt32 end = mSoftTextDOMMapping.Length();
663 0 : while (end - start >= 2) {
664 0 : PRInt32 mid = (start + end)/2;
665 0 : const DOMTextMapping& map = mSoftTextDOMMapping[mid];
666 0 : if (map.mSoftTextOffset > aSoftTextOffset) {
667 0 : end = mid;
668 : } else {
669 0 : start = mid;
670 : }
671 : }
672 :
673 0 : if (start >= end)
674 0 : return NodeOffset(nsnull, -1);
675 :
676 : // 'start' is now the last mapping, if any, such that
677 : // mSoftTextOffset <= aSoftTextOffset.
678 : // If we're doing HINT_END, then we may want to return the end of the
679 : // the previous mapping instead of the start of this mapping
680 0 : if (aHint == HINT_END && start > 0) {
681 0 : const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
682 0 : if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
683 0 : return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
684 : }
685 :
686 : // We allow ourselves to return the end of this mapping even if we're
687 : // doing HINT_START. This will only happen if there is no mapping which this
688 : // point is the start of. I'm not 100% sure this is OK...
689 0 : const DOMTextMapping& map = mSoftTextDOMMapping[start];
690 0 : PRInt32 offset = aSoftTextOffset - map.mSoftTextOffset;
691 0 : if (offset >= 0 && offset <= map.mLength)
692 0 : return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
693 :
694 0 : return NodeOffset(nsnull, -1);
695 : }
696 :
697 : PRInt32
698 0 : mozInlineSpellWordUtil::FindRealWordContaining(PRInt32 aSoftTextOffset,
699 : DOMMapHint aHint, bool aSearchForward)
700 : {
701 0 : NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
702 0 : if (!mSoftTextValid)
703 0 : return -1;
704 :
705 : // The invariant is that the range start..end includes the last word,
706 : // if any, such that mSoftTextOffset <= aSoftTextOffset
707 0 : PRInt32 start = 0;
708 0 : PRInt32 end = mRealWords.Length();
709 0 : while (end - start >= 2) {
710 0 : PRInt32 mid = (start + end)/2;
711 0 : const RealWord& word = mRealWords[mid];
712 0 : if (word.mSoftTextOffset > aSoftTextOffset) {
713 0 : end = mid;
714 : } else {
715 0 : start = mid;
716 : }
717 : }
718 :
719 0 : if (start >= end)
720 0 : return -1;
721 :
722 : // 'start' is now the last word, if any, such that
723 : // mSoftTextOffset <= aSoftTextOffset.
724 : // If we're doing HINT_END, then we may want to return the end of the
725 : // the previous word instead of the start of this word
726 0 : if (aHint == HINT_END && start > 0) {
727 0 : const RealWord& word = mRealWords[start - 1];
728 0 : if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
729 0 : return start - 1;
730 : }
731 :
732 : // We allow ourselves to return the end of this word even if we're
733 : // doing HINT_START. This will only happen if there is no word which this
734 : // point is the start of. I'm not 100% sure this is OK...
735 0 : const RealWord& word = mRealWords[start];
736 0 : PRInt32 offset = aSoftTextOffset - word.mSoftTextOffset;
737 0 : if (offset >= 0 && offset <= word.mLength)
738 0 : return start;
739 :
740 0 : if (aSearchForward) {
741 0 : if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
742 : // All words have mSoftTextOffset > aSoftTextOffset
743 0 : return 0;
744 : }
745 : // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
746 : // Word start+1, if it exists, will be the first with
747 : // mSoftTextOffset > aSoftTextOffset.
748 0 : if (start + 1 < PRInt32(mRealWords.Length()))
749 0 : return start + 1;
750 : }
751 :
752 0 : return -1;
753 : }
754 :
755 : /*********** Word Splitting ************/
756 :
// Classification of a character within a DOM word, as used by WordSplitState.
enum CharClass {
  // the character belongs to a word
  CHAR_CLASS_WORD,
  // the character separates words
  CHAR_CLASS_SEPARATOR,
  // there is no current character (the DOM word has been consumed)
  CHAR_CLASS_END_OF_INPUT
};
762 :
763 : // Encapsulates DOM-word to real-word splitting
764 : struct WordSplitState
765 0 : {
766 : mozInlineSpellWordUtil* mWordUtil;
767 : const nsDependentSubstring mDOMWordText;
768 : PRInt32 mDOMWordOffset;
769 : CharClass mCurCharClass;
770 :
771 0 : WordSplitState(mozInlineSpellWordUtil* aWordUtil,
772 : const nsString& aString, PRInt32 aStart, PRInt32 aLen)
773 : : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
774 0 : mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
775 :
776 : CharClass ClassifyCharacter(PRInt32 aIndex, bool aRecurse) const;
777 : void Advance();
778 : void AdvanceThroughSeparators();
779 : void AdvanceThroughWord();
780 :
781 : // Finds special words like email addresses and URLs that may start at the
782 : // current position, and returns their length, or 0 if not found. This allows
783 : // arbitrary word breaking rules to be used for these special entities, as
784 : // long as they can not contain whitespace.
785 : PRInt32 FindSpecialWord();
786 :
787 : // Similar to FindSpecialWord except that this takes a split word as
788 : // input. This checks for things that do not require special word-breaking
789 : // rules.
790 : bool ShouldSkipWord(PRInt32 aStart, PRInt32 aLength);
791 : };
792 :
793 : // WordSplitState::ClassifyCharacter
794 :
795 : CharClass
796 0 : WordSplitState::ClassifyCharacter(PRInt32 aIndex, bool aRecurse) const
797 : {
798 0 : NS_ASSERTION(aIndex >= 0 && aIndex <= PRInt32(mDOMWordText.Length()),
799 : "Index out of range");
800 0 : if (aIndex == PRInt32(mDOMWordText.Length()))
801 0 : return CHAR_CLASS_SEPARATOR;
802 :
803 : // this will classify the character, we want to treat "ignorable" characters
804 : // such as soft hyphens as word characters.
805 : nsIUGenCategory::nsUGenCategory
806 0 : charCategory = mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
807 0 : if (charCategory == nsIUGenCategory::kLetter ||
808 0 : IsIgnorableCharacter(mDOMWordText[aIndex]))
809 0 : return CHAR_CLASS_WORD;
810 :
811 : // If conditional punctuation is surrounded immediately on both sides by word
812 : // characters it also counts as a word character.
813 0 : if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
814 0 : if (!aRecurse) {
815 : // not allowed to look around, this punctuation counts like a separator
816 0 : return CHAR_CLASS_SEPARATOR;
817 : }
818 :
819 : // check the left-hand character
820 0 : if (aIndex == 0)
821 0 : return CHAR_CLASS_SEPARATOR;
822 0 : if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
823 0 : return CHAR_CLASS_SEPARATOR;
824 : // If the previous charatcer is a word-char, make sure that it's not a
825 : // special dot character.
826 0 : if (mDOMWordText[aIndex - 1] == '.')
827 0 : return CHAR_CLASS_SEPARATOR;
828 :
829 : // now we know left char is a word-char, check the right-hand character
830 0 : if (aIndex == PRInt32(mDOMWordText.Length()) - 1)
831 0 : return CHAR_CLASS_SEPARATOR;
832 0 : if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
833 0 : return CHAR_CLASS_SEPARATOR;
834 : // If the next charatcer is a word-char, make sure that it's not a
835 : // special dot character.
836 0 : if (mDOMWordText[aIndex + 1] == '.')
837 0 : return CHAR_CLASS_SEPARATOR;
838 :
839 : // char on either side is a word, this counts as a word
840 0 : return CHAR_CLASS_WORD;
841 : }
842 :
843 : // The dot character, if appearing at the end of a word, should
844 : // be considered part of that word. Example: "etc.", or
845 : // abbreviations
846 0 : if (aIndex > 0 &&
847 0 : mDOMWordText[aIndex] == '.' &&
848 0 : mDOMWordText[aIndex - 1] != '.' &&
849 0 : ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
850 0 : return CHAR_CLASS_WORD;
851 : }
852 :
853 : // all other punctuation
854 0 : if (charCategory == nsIUGenCategory::kSeparator ||
855 : charCategory == nsIUGenCategory::kOther ||
856 : charCategory == nsIUGenCategory::kPunctuation ||
857 : charCategory == nsIUGenCategory::kSymbol) {
858 : // Don't break on hyphens, as hunspell handles them on its own.
859 0 : if (aIndex > 0 &&
860 0 : mDOMWordText[aIndex] == '-' &&
861 0 : mDOMWordText[aIndex - 1] != '-' &&
862 0 : ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
863 : // A hyphen is only meaningful as a separator inside a word
864 : // if the previous and next characters are a word character.
865 0 : if (aIndex == PRInt32(mDOMWordText.Length()) - 1)
866 0 : return CHAR_CLASS_SEPARATOR;
867 0 : if (mDOMWordText[aIndex + 1] != '.' &&
868 0 : ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
869 0 : return CHAR_CLASS_WORD;
870 : }
871 0 : return CHAR_CLASS_SEPARATOR;
872 : }
873 :
874 : // any other character counts as a word
875 0 : return CHAR_CLASS_WORD;
876 : }
877 :
878 :
879 : // WordSplitState::Advance
880 :
881 : void
882 0 : WordSplitState::Advance()
883 : {
884 0 : NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
885 0 : NS_ASSERTION(mDOMWordOffset < (PRInt32)mDOMWordText.Length(),
886 : "Length beyond end");
887 :
888 0 : mDOMWordOffset ++;
889 0 : if (mDOMWordOffset >= (PRInt32)mDOMWordText.Length())
890 0 : mCurCharClass = CHAR_CLASS_END_OF_INPUT;
891 : else
892 0 : mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
893 0 : }
894 :
895 :
896 : // WordSplitState::AdvanceThroughSeparators
897 :
898 : void
899 0 : WordSplitState::AdvanceThroughSeparators()
900 : {
901 0 : while (mCurCharClass == CHAR_CLASS_SEPARATOR)
902 0 : Advance();
903 0 : }
904 :
905 : // WordSplitState::AdvanceThroughWord
906 :
907 : void
908 0 : WordSplitState::AdvanceThroughWord()
909 : {
910 0 : while (mCurCharClass == CHAR_CLASS_WORD)
911 0 : Advance();
912 0 : }
913 :
914 :
915 : // WordSplitState::FindSpecialWord
916 :
917 : PRInt32
918 0 : WordSplitState::FindSpecialWord()
919 : {
920 : PRInt32 i;
921 :
922 : // Search for email addresses. We simply define these as any sequence of
923 : // characters with an '@' character in the middle. The DOM word is already
924 : // split on whitepace, so we know that everything to the end is the address
925 : //
926 : // Also look for periods, this tells us if we want to run the URL finder.
927 0 : bool foundDot = false;
928 0 : PRInt32 firstColon = -1;
929 0 : for (i = mDOMWordOffset;
930 0 : i < PRInt32(mDOMWordText.Length()); i ++) {
931 0 : if (mDOMWordText[i] == '@') {
932 : // only accept this if there are unambiguous word characters (don't bother
933 : // recursing to disambiguate apostrophes) on each side. This prevents
934 : // classifying, e.g. "@home" as an email address
935 :
936 : // Use this condition to only accept words with '@' in the middle of
937 : // them. It works, but the inlinespellcker doesn't like this. The problem
938 : // is that you type "fhsgfh@" that's a misspelled word followed by a
939 : // symbol, but when you type another letter "fhsgfh@g" that first word
940 : // need to be unmarked misspelled. It doesn't do this. it only checks the
941 : // current position for potentially removing a spelling range.
942 0 : if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
943 0 : i < (PRInt32)mDOMWordText.Length() - 1 &&
944 0 : ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD)
945 :
946 0 : return mDOMWordText.Length() - mDOMWordOffset;
947 0 : } else if (mDOMWordText[i] == '.' && ! foundDot &&
948 0 : i > 0 && i < (PRInt32)mDOMWordText.Length() - 1) {
949 : // we found a period not at the end, we should check harder for URLs
950 0 : foundDot = true;
951 0 : } else if (mDOMWordText[i] == ':' && firstColon < 0) {
952 0 : firstColon = i;
953 : }
954 : }
955 :
956 : // If the first colon is followed by a slash, consider it a URL
957 : // This will catch things like asdf://foo.com
958 0 : if (firstColon >= 0 && firstColon < (PRInt32)mDOMWordText.Length() - 1 &&
959 0 : mDOMWordText[firstColon + 1] == '/') {
960 0 : return mDOMWordText.Length() - mDOMWordOffset;
961 : }
962 :
963 : // Check the text before the first colon against some known protocols. It
964 : // is impossible to check against all protocols, especially since you can
965 : // plug in new protocols. We also don't want to waste time here checking
966 : // against a lot of obscure protocols.
967 0 : if (firstColon > mDOMWordOffset) {
968 : nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
969 0 : firstColon - mDOMWordOffset));
970 0 : if (protocol.EqualsIgnoreCase("http") ||
971 0 : protocol.EqualsIgnoreCase("https") ||
972 0 : protocol.EqualsIgnoreCase("news") ||
973 0 : protocol.EqualsIgnoreCase("file") ||
974 0 : protocol.EqualsIgnoreCase("javascript") ||
975 0 : protocol.EqualsIgnoreCase("ftp")) {
976 0 : return mDOMWordText.Length() - mDOMWordOffset;
977 : }
978 : }
979 :
980 : // not anything special
981 0 : return -1;
982 : }
983 :
984 : // WordSplitState::ShouldSkipWord
985 :
986 : bool
987 0 : WordSplitState::ShouldSkipWord(PRInt32 aStart, PRInt32 aLength)
988 : {
989 0 : PRInt32 last = aStart + aLength;
990 :
991 : // check to see if the word contains a digit
992 0 : for (PRInt32 i = aStart; i < last; i ++) {
993 0 : PRUnichar ch = mDOMWordText[i];
994 : // XXX Shouldn't this be something a lot more complex, Unicode-based?
995 0 : if (ch >= '0' && ch <= '9')
996 0 : return true;
997 : }
998 :
999 : // not special
1000 0 : return false;
1001 : }
1002 :
1003 : // mozInlineSpellWordUtil::SplitDOMWord
1004 :
1005 : void
1006 0 : mozInlineSpellWordUtil::SplitDOMWord(PRInt32 aStart, PRInt32 aEnd)
1007 : {
1008 0 : WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
1009 0 : state.mCurCharClass = state.ClassifyCharacter(0, true);
1010 :
1011 0 : while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1012 0 : state.AdvanceThroughSeparators();
1013 0 : if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
1014 0 : break;
1015 :
1016 0 : PRInt32 specialWordLength = state.FindSpecialWord();
1017 0 : if (specialWordLength > 0) {
1018 : mRealWords.AppendElement(
1019 0 : RealWord(aStart + state.mDOMWordOffset, specialWordLength, false));
1020 :
1021 : // skip the special word
1022 0 : state.mDOMWordOffset += specialWordLength;
1023 0 : if (state.mDOMWordOffset + aStart >= aEnd)
1024 0 : state.mCurCharClass = CHAR_CLASS_END_OF_INPUT;
1025 : else
1026 0 : state.mCurCharClass = state.ClassifyCharacter(state.mDOMWordOffset, true);
1027 0 : continue;
1028 : }
1029 :
1030 : // save the beginning of the word
1031 0 : PRInt32 wordOffset = state.mDOMWordOffset;
1032 :
1033 : // find the end of the word
1034 0 : state.AdvanceThroughWord();
1035 0 : PRInt32 wordLen = state.mDOMWordOffset - wordOffset;
1036 : mRealWords.AppendElement(
1037 : RealWord(aStart + wordOffset, wordLen,
1038 0 : !state.ShouldSkipWord(wordOffset, wordLen)));
1039 : }
1040 0 : }
|