User:PerfektesChaos/WikidiffLX/coding/WikidiffLX.cpp
Based on wikidiff2/wikidiff2.cpp (rev 83886 Mar 2011).
Declarations in WikidiffLX.h
- Renamed following the new generation name
- Added many methods
Contents |
head [edit]
/** * Diff formatter, based on code by Steinar H. Gunderson, * converted to work with the Dairiki diff engine by Tim Starling (Wikidiff2) * * Extended for WikidiffLX by PerfektesChaos@de.wikipedia 2011 * GPL. */ #include "WikidiffLX.h" #include <string> #include <thai/thailib.h> #include <thai/thwchar.h> #include <thai/thbrk.h> #include <stdio.h> #define SPAN_INLINE "<span class=\"diffchange diffchange-inline\">" #define TD_LEFT "<tr>\n <td colspan=\"2\"> " #define TD_RIGHT "</td>\n" \ " <td class=\"diff-marker\">+</td>\n" \ " <td class=\"diff-addedline\">" #define TR_LEFT "<tr>\n <td class=\"diff-marker\">−</td>\n" \ " <td class=\"diff-deletedline\">"
public [edit]
execute() [edit]
Based on wikidiff2 with following changes:
- merged with former diffLines()
- delegating HTML output to printLines()
Main task: controls line separation, runs the diff engine, postprocesses the engine result and formats the HTML output.
/**
 * Compute the HTML diff of two texts.
 *
 * Pipeline: split both texts into (virtual) lines, run the line diff
 * engine, copy the engine verdict into the Line objects, postprocess
 * whitespace and paragraph structure, and finally render HTML.
 *
 * @param text1            old text
 * @param text2            new text
 * @param numContextLines  number of unchanged context lines around a diff;
 *                         stored in numberContextL
 * @return reference to the member buffer `result` holding the HTML
 */
const WikidiffLX::String & WikidiffLX::execute(const String & text1,
                                               const String & text2,
                                               int numContextLines)
{
    // Start from an empty buffer that is large enough to avoid
    // repeated reallocation while HTML is appended
    result.clear();
    result.reserve(text1.size() + text2.size() + MAX_DIFF_LINE);
    numberContextL = numContextLines;

    // Line separation
    LineVector oldLines;
    explodeLines(text1, oldLines);
    LineVector newLines;
    explodeLines(text2, newLines);

    // Diff engine, then transfer of its result into the Line objects
    LineDiff engineResult(oldLines, newLines);
    diffResult(engineResult);

    // Postprocessing: whitespace-only differences become changes,
    // then the virtual line segmentation is adjusted
    diffWhitespace(oldLines);
    recoverParagraphs(oldLines, newLines);

    // HTML rendering into the result buffer
    printLines(oldLines, newLines);

    return result;
}   // execute()
private [edit]
diffResult() [edit]
New methodology for postprocessing of virtual lines. Result of engine is transferred from LineDiff into Line objects. Note that the two LineVector collections are accessed by their pointers in DiffOp.
The block structure of combined op codes in LineDiff is part of the engine algorithm, located deep inside. That should not be changed. It would affect Word presentation as well as possible exchange of the engine. The Diff object does not permit array manipulation. The least confusing way is to transfer the engine result into the Line objects.
/**
 * Transfer the engine result from the LineDiff into the Line objects.
 *
 * For every diff op the lines referenced by the op's `from` / `to`
 * pointer vectors are marked:
 *  - copy / change: lines are paired index by index and each one gets
 *    the op code plus a pointer to its partner; surplus `from` lines
 *    become 'del', surplus `to` lines become 'add'
 *  - del: every `from` line becomes 'del'
 *  - add: every `to` line becomes 'add'
 * Any other op code is ignored.
 *
 * The pointer vectors are accessed by const reference; the former
 * implementation copied both vectors for every single op.
 *
 * @param diffs  line diff as produced by the engine (passed by value as
 *               declared; the Line objects are reached through the
 *               pointers stored in each op)
 */
void WikidiffLX::diffResult(LineDiff diffs)
{
    const size_t nDiff = diffs.size();
    for (size_t i = 0; i < nDiff; i++) {
        DiffOp<Line> *diff = &diffs[i];
        const bool paired = (diff->op == DiffOp<Line>::copy
                             || diff->op == DiffOp<Line>::change);
        const bool useFrom = (paired || diff->op == DiffOp<Line>::del);
        const bool useTo = (paired || diff->op == DiffOp<Line>::add);
        const LinePVector & fv = diff->from;
        const LinePVector & tv = diff->to;
        const size_t nFrom = (useFrom ? fv.size() : 0);
        const size_t nTo = (useTo ? tv.size() : 0);
        const size_t nBoth = (paired ? std::min(nFrom, nTo) : 0);
        size_t j;
        Line *f;
        Line *t;
        // The casts are kept C-style as in the original code; presumably
        // they strip constness from the engine's pointers - TODO confirm
        for (j = 0; j < nBoth; j++) {   // partners on both sides
            f = (Line *)fv[j];
            t = (Line *)tv[j];
            f->set_diff(diff->op, t);
            t->set_diff(diff->op, f);
        }   // for j
        for (j = nBoth; j < nFrom; j++) {   // old lines without partner
            f = (Line *)fv[j];
            f->set_diff(DiffOp<Line>::del);
        }   // for j
        for (j = nBoth; j < nTo; j++) {   // new lines without partner
            t = (Line *)tv[j];
            t->set_diff(DiffOp<Line>::add);
        }   // for j
    }   // for i
}   // diffResult()
diffWhitespace() [edit]
New methodology for trailing whitespace at end of line and invisible trailing lines. Major part of work is done within Line objects.
Note that this is called only once, for the from lines, but it also affects their to counterparts.
/**
 * Detect whitespace-only differences and translate them into changes.
 *
 * Calls whitespaceOnly() on every line of `from`; the actual work is
 * done inside the Line objects.
 *
 * NOTE(review): `from` is received by value, so whitespaceOnly() runs on
 * the elements of a copy - confirm that this is intended (the signature
 * is fixed by the declaration in WikidiffLX.h).
 *
 * @param from  lines of the old text
 */
void WikidiffLX::diffWhitespace(LineVector from)
{
    const size_t count = from.size();
    size_t idx = 0;
    while (idx < count) {
        from[idx].whitespaceOnly();
        ++idx;
    }   // while idx
}   // diffWhitespace()
explodeLines() [edit]
Based on explodeLines() in wikidiff2 but recognizing breaks of virtual lines. Detect invisible trailing lines, separate from lines with visible content and hide them in Line objects.
Usage of allocated String for line text replaced by iterators.
/**
 * Split text into Line objects and append them to lines.
 *
 * Besides the hard line breaks ('\n'), long lines are cut into virtual
 * lines at positions reported by findVirtualBreak(). Trailing blanks
 * (0x20) of a line and the following run of lines that contain only
 * bytes <= 0x20 are not turned into lines of their own but handed to
 * the Line constructor as iterator positions.
 *
 * Iterator roles inside the loop:
 *   pB  begin of the current (virtual) line
 *   pH  position of the next '\n', or pE
 *   pV  virtual break position found in a long line
 *   pS  begin of the trailing blanks / begin of the next virtual line
 *   pT  end of the trailing invisible lines (begin of the next line)
 *
 * Unless NO_LINE_NUMBERS is defined, k counts source lines and keep is
 * the number handed to the Line objects created for the current line.
 *
 * @param text   text to be split; the Line objects keep iterators into it
 * @param lines  receives the Line objects
 */
void WikidiffLX::explodeLines(const String & text, LineVector & lines) {
    const Iterator pE = text.end();
    Iterator pB = text.begin();
    Iterator p;
    Iterator pH;
    Iterator pV;
    Iterator pS;
    Iterator pT;
    unsigned char b;
#ifndef NO_LINE_NUMBERS
    size_t k = 1;
    size_t keep = 1;
#endif
    while (pB != pE) {
        // Hard end of the current source line
        pH = std::find(pB, pE, '\n');
        // Cut off virtual lines as long as the remainder is too long
        while (pH - pB > LINELENGTH_VIRTUAL) {
            // Search window leaves LINELENGTH_MIN bytes at either end
            pT = pH - LINELENGTH_MIN;
            findVirtualBreak(pB + LINELENGTH_MIN, pT, &pV);
            if (pV == pT) { // not found
                break; // while
            } else {
                // Skip the blanks following the break position
                pS = pV;
                while (pS != pE) {
                    b = (unsigned char)*pS;
                    if (b == 0x20) { // or other invisibles
                        pS++;
                    } else {
                        break; // while
                    }
                } // while
                // Accept the break only if enough text remains behind it
                if (pH - pS > LINELENGTH_MIN) {
#ifdef NO_LINE_NUMBERS
                    lines.push_back(Line(pB, pV, pS));
#else
                    lines.push_back(Line(pB, pV, pS, keep));
#endif
                    pB = pS;
                } else {
                    break; // while
                }
            }
        } // while pH - pB > LINELENGTH_VIRTUAL
        // Walk backwards from the line end over trailing blanks;
        // pS ends at the first blank of that run (or at pH if none)
        pS = pH;
        p = pH;
        while (p > pB) {
            p--;
            b = (unsigned char)*p;
            if (b == 0x20) { // or other invisibles
                pS = p;
            } else {
                break; // while
            }
        } // while suffix
        if (pH == pE) {
            // Last line without terminating newline
            pT = pE;
        } else {
            // Walk forward over bytes <= 0x20; every newline found moves
            // pT just behind it, so pT finally marks the begin of the
            // next line that has a byte > 0x20
            pT = pH;
            p = pT;
            while (true) {
                if (p == pE) {
                    // Only invisible content up to the end of the text
                    if (pH == pB) {
                        pH = pE;
                    }
                    pT = pE;
                    break; // while
                } else {
                    b = (unsigned char)*p;
                    if (b > 0x20) { // may be other invisibles (Unicode)
                        break; // while
                    } else if (b == 0x0A) {
#ifndef NO_LINE_NUMBERS
                        // Every newline after the first one of this run
                        // counts as an additional source line
                        if (pT != pH) {
                            k++;
                        }
#endif
                        pT = p + 1;
                    }
                }
                p++;
            } // while trailing
        }
#ifdef NO_LINE_NUMBERS
        lines.push_back(Line(pB, pS, pH, pT));
#else
        lines.push_back(Line(pB, pS, pH, pT, keep));
        k++;
        keep = k;
#endif
        pB = pT;
        // NOTE(review): pT is assigned again before it is read in the
        // next iteration, so this increment appears to have no effect
        if (pT != pE) {
            pT++;
        }
    } // while pB != pE
} // explodeLines()
explodeWords() [edit]
Based on explodeWords() in wikidiff2 but completely rewritten and accelerated. Handling of Unicode spaces has been introduced. Thai processing is started only when a Thai character is actually encountered. Every CJK symbol is now treated as a single word.
Not yet included:
- rev:107135 possible future direction: ICU.
void WikidiffLX::explodeWords(const String & text, WordVector &words) { // Split a string into words const Iterator pE = text.end(); Iterator p = text.begin(); // Don't try to do a word-level diff on very long lines if (text.size() > MAX_DIFF_LINE) { words.push_back(Word(p, pE, pE)); return; } Iterator chStart; Iterator suffixStart; Iterator wordStart; unsigned int ch; unsigned char b; bool livingSuffix = false; bool livingWord = false; bool locateSuffix; while (p < pE) { chStart = p; locateSuffix = false; b = (unsigned char)*p; if (b < 0xC0) { // ASCII or undefined if (b < 0x21) { locateSuffix = (b == 0x20 || 0x09); // some more ... } } else { // UTF range ch = nextUtf8Char(p, b, pE); if (ch > 0x0E00) { // could be interesting if (ch > 0x0E00 && ch <= 0x0E5F) { // Thai explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, chStart, words); wordStart = chStart; explodeWordsThai(ch, p, pE, words, wordStart); } else if (ch >= 0x2002 && ch <= 0x200A) { // spaces locateSuffix = true; } else if (ch >= 0x1100 && ch <= 0x11FF || ch >= 0x2E80 && ch <= 0x9FFF || ch >= 0xAC00 && ch <= 0xD7AF || ch >= 0x20000 && ch <= 0x2FA1F) { // CJK explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, chStart, words); wordStart = chStart; words.push_back(Word(chStart, p, p)); } } } if (locateSuffix) { if (! livingSuffix) { suffixStart = chStart; livingSuffix = true; } } else if (livingSuffix) { explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, p, words); } else if (! 
livingWord) { wordStart = chStart; livingWord = true; } ++p; } // while (p < pE) explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, pE, words); } // explodeWords() void WikidiffLX::explodeWordSuffix(bool & livingWord, const Iterator wordStart, bool & livingSuffix, const Iterator suffixStart, const Iterator wordNext, WordVector & words) { // Add word body and suffix interval to words, if any; reset living if (livingWord) { // regular word pending if (wordStart < wordNext) { words.push_back(Word(wordStart, (livingSuffix?suffixStart:wordNext), wordNext)); } } else if (livingSuffix) { // append to previous CJK or Thai word if (! words.empty()) { // not the beginning of the line words.pop_back(); words.push_back(Word(wordStart, suffixStart, wordNext)); } } livingWord = false; livingSuffix = false; } // explodeWordSuffix()
explodeWordsThai() [edit]
Rewritten procedure, called from explodeWords(), which handles a substring consisting of Thai characters only and splits it into Thai words.
void WikidiffLX::explodeWordsThai(unsigned int ch, Iterator & p, const Iterator pE, WordVector & words, Iterator & wordStart) { // Pointing on thai character sequence and add particular thai words. // Thai chars are in U+0E00...0E5F each represented by three bytes UTF8. IntVector thaiBreakPositions; String tisText; Iterator pT = p; Iterator p0 = wordStart; int nBreaks; size_t max = (pE - p) / 3 + 1; unsigned char b; tisText.reserve(max); tisText = th_uni2tis(ch); while (pT < pE) { b = (unsigned char)*pT; if (b == 0xC0) { ch = nextUtf8Char(pT, b, pE); if (ch > 0x0E00 && ch <= 0x0E5F) { tisText += th_uni2tis(ch); p = pT; pT++; } else { // not in Thai range break; // while } } else { // out of relevant UTF8 break; // while } } // while (pT < pE) tisText += '\0'; max = tisText.size(); thaiBreakPositions.resize(max); nBreaks = th_brk((const thchar_t*)(tisText.data()), &thaiBreakPositions[0], max); for (int i = 0; i < nBreaks; i++) { pT = p0 + 3 * thaiBreakPositions[i]; words.push_back(Word(wordStart, pT, pT)); wordStart = pT; } // for i } // explodeWordsThai()
findVirtualBreak() [edit]
New method for detection of breaks of virtual lines within explodeLines().
void WikidiffLX::findPunctSpace(const Iterator pB, const Iterator pE, const unsigned char punct, Iterator * p) { // Detect punct+space; return point after punct or pE Iterator pR = pE; unsigned char b; *p = std::find(pB, pE, punct); while (*p != pE) { *p++; if (*p != pE) { b = (unsigned char)**p; if (b == 0x20) { break; // while } else { *p = std::find(*p, pE, punct); } } } } // findPunctSpace() void WikidiffLX::findVirtualBreak(const Iterator pB, const Iterator pE, Iterator * p) { // Detect virtual line break; return point after terminator *p = pE; findPunctSpace(pB, *p, '.', p); findPunctSpace(pB, *p, '?', p); findPunctSpace(pB, *p, '!', p); /* Other candidates, requiring UTF8 decoding: * U+3002 CJK period */ } // findVirtualBreak()
nextUtf8Char() [edit]
Based on wikidiff2 but demanding 32bit characters explicitly.
/**
 * Weak UTF-8 decoder: decode the character starting at p.
 *
 * Will return garbage on invalid input (overshort sequences, overlong
 * sequences, etc.) but Mediawiki never provides bad UTF-8 texts ...
 *
 * Correction against the former implementation: the byte b was never
 * reloaded inside the loop. For every lead byte the sequence length was
 * therefore set again and again, no continuation byte was ever decoded,
 * and p ran up to pE. The next byte is now read after each step.
 *
 * NOTE(review): callers in this file advance their iterator once more
 * after the call - confirm the intended final position of p.
 *
 * @param p   in: position of the first byte; out: position behind the
 *            last byte that has been consumed
 * @param b   first byte of the character, i.e. (unsigned char)*p
 * @param pE  end of the text
 * @return the decoded code point; 0 for an unexpected continuation byte
 *         or if p == pE
 */
unsigned int WikidiffLX::nextUtf8Char(Iterator & p,
                                      unsigned char b,
                                      const Iterator pE)
{
    short seqLength = 0;   // number of continuation bytes still expected
    unsigned int c = 0;

    while (p < pE) {
        if (b < 0x80) {
            c = b;
            seqLength = 0;
        } else if (b >= 0xC0) {
            // Start of UTF-8 character
            // If this is unexpected, due to an overshort sequence,
            // we ignore the invalid sequence and resynchronise here
            if (b < 0xE0) {
                seqLength = 1;
                c = b & 0x1F;
            } else if (b < 0xF0) {
                seqLength = 2;
                c = b & 0x0F;
            } else {
                seqLength = 3;
                c = b & 7;
            }
        } else if (seqLength) {
            c <<= 6;
            c |= b & 0x3F;
            --seqLength;
        } else {
            // Unexpected continuation, ignore
        }
        ++p;
        if (! seqLength) {
            break;   // while
        }
        if (p < pE) {
            b = (unsigned char)*p;   // fetch next byte of the sequence
        }
    }   // while (p < pE)
    return c;
}   // nextUtf8Char()
printLines() [edit]
Mainly new code for the presentation of paragraphs. The respective diffLines() procedure in wikidiff2 is replaced by printLines(). Handling of virtual line sequences is implemented by filling table cells until hard break encountered. Trailing invisible lines are discovered and presented as table cells where necessary.
void WikidiffLX::printLines(const LineVector from, const LineVector to) { // Print all diff into result as HTML // linediff is adjusted with hard breaks at .op borders // .op=change lines are merged as single paragraphs const size_t nFrom = from.size(); const size_t nTo = to.size(); size_t iFrom = 0; size_t iTo = 0; bool leap = from[0].is_Copy() && to[0].is_Copy(); while (iFrom < nFrom && iTo < nTo) { if (leap) { printLinesContext(from, to, nFrom, nTo, &iFrom, &iTo); } else { printLinesDiff(from, to, nFrom, nTo, &iFrom, &iTo); } leap = ! leap; } // while } // printLines() void WikidiffLX::printLinesContext(const LineVector from, const LineVector to, const size_t nFrom, const size_t nTo, size_t * iFrom, size_t * iTo) { // Print unchanged lines as context before / after diff // *iFrom,*iTo changing from start of copy block to end of copy block const size_t jFrom = *iFrom; // end of previous differing const size_t jTo = *iTo; // end with following differing size_t k; bool latest = true; // last block at all for ( ; *iFrom < nFrom; *iFrom++) { if (! from[*iFrom].is_Copy()) { latest = false; break; // for *iFrom } } // for *iFrom for ( ; *iTo < nTo; *iTo++) { if (! to[*iTo].is_Copy()) { latest = false; break; // for *iTo } } // for *iTo if (jFrom || jTo) { // pending trailer required k = jFrom + numberContextL; if (latest) { printLinesContextRows(from, *iFrom, std::min(k, nFrom)); } else { // differing block follows if (*iFrom - jFrom > 2 * numberContextL + 1) { // separate printLinesContextRows(from, jFrom, jFrom + numberContextL); k = *iFrom - numberContextL; #ifndef NO_LINE_NUMBERS printLineNumber(from[k], to[*iTo - numberContextL]); #endif printLinesContextRows(from, k, *iFrom); } else { // one block printLinesContextRows(from, jFrom, *iFrom); } } } else if (! latest) { // first lines not differing, but later k = (*iFrom > numberContextL ? 
*iFrom - numberContextL : 0); #ifndef NO_LINE_NUMBERS printLineNumber(from[k], to[k]); #endif printLinesContextRows(from, k, *iFrom); } } // printLinesContext() void WikidiffLX::printLinesContextRows(const LineVector context, const size_t iBeg, const size_t iEnd) { size_t i; size_t k = iBeg; for (i = iBeg; i < iEnd; i++) { if (context[i].is_HardBreak()) { printLinesContextRow(context, k, i); k = i; } } // for i if (k < iEnd) { printLinesContextRow(context, k, iEnd); } } // printLinesContextRows() void WikidiffLX::printLinesContextRow(const LineVector context, const size_t iBeg, const size_t iEnd) { // Print paragraph for context <tr ...</tr> size_t i; result += "<tr>\n" " <td class=\"diff-marker\"> </td>\n" " <td class=\"diff-context\"><div>"; for (i = iBeg; i < iEnd; i++) { if (i != iBeg) { result += " "; // drop invisible whitespace, pay for a space } printText(context[i].get_body()); } // for i result += "</div></td>\n" " <td class=\"diff-marker\"> </td>\n" " <td class=\"diff-context\"><div>"; for (i = iBeg; i < iEnd; i++) { if (i != iBeg) { result += " "; // drop invisible whitespace, pay for a space } printText(context[i].get_body()); } // for i result += "</div></td>\n</tr>\n"; } // printLinesContextRow() #ifndef NO_LINE_NUMBERS void WikidiffLX::printLineNumber(const Line f, const Line t) { snprintf(scratch256, 256, "<tr>\n" " <td colspan=\"2\" class=\"diff-lineno\"><!--LINE %u--></td>\n" " <td colspan=\"2\" class=\"diff-lineno\"><!--LINE %u--></td>\n" "</tr>\n", f.get_lineNumber(), t.get_lineNumber()); result += scratch256; } // printLineNumber() #endif void WikidiffLX::printLinesDiff(const LineVector from, const LineVector to, const size_t nFrom, const size_t nTo, size_t * iFrom, size_t * iTo) { // Print sequence of differing lines int mL; int mR; while (*iFrom < nFrom || *iTo < nTo) { mL = DiffOp<Line>::copy; mR = DiffOp<Line>::copy; if (*iFrom < nFrom) { mL = from[*iFrom].get_diffCode(); if (mL == DiffOp<Line>::del) { mL = printLinesDelete(from, nFrom, 
iFrom); } } if (*iTo < nTo) { mR = to[*iTo].get_diffCode(); if (mR == DiffOp<Line>::del) { mR = printLinesAdd(to, nTo, iTo); } } if (mL == DiffOp<Line>::copy && mR == DiffOp<Line>::copy) { break; // while } else { printLinesChange(from, to, nFrom, nTo, iFrom, iTo); } } // while <nFrom && <nTo } // printLinesDiff() int WikidiffLX::printLineSingleCell(const LineVector vector, const size_t n, size_t * i) { // Print table cell for 'del' or 'add' // Return number of trailing lines const Line *e; size_t more = 0; while (*i < n) { e = &vector[*i]; (*i)++; printText(e->get_body()); if (e->is_HardBreak()) { more = e->get_trailingCount(); break; // while } else { result += " "; } } // while true result += "</div></td>\n</tr>\n"; return more; } // printLineSingleCell() int WikidiffLX::printLinesDelete(const LineVector from, const size_t nFrom, size_t * iFrom) { // Print blocks of deleted lines starting at from[*iFrom] // Return next opcode size_t more; int next = DiffOp<Line>::copy; while (*iFrom < nFrom) { next = from[*iFrom].get_diffCode(); if (next == DiffOp<Line>::copy) { break; // while } else { result += TR_LEFT "<div>"; more = printLineSingleCell(from, nFrom, iFrom); while (more) { result += TR_LEFT " </td>\n</tr>\n"; more--; } // while } } // while <nFrom return next; } // printLinesDelete() int WikidiffLX::printLinesAdd(const LineVector to, const size_t nTo, size_t * iTo) { // Print blocks of added lines starting at to[*iTo] // Return next opcode size_t more; int next = DiffOp<Line>::copy; while (*iTo < nTo) { next = to[*iTo].get_diffCode(); if (next == DiffOp<Line>::copy) { break; // while } else { result += TD_LEFT TD_RIGHT "<div>"; more = printLineSingleCell(to, nTo, iTo); while (more) { result += TD_LEFT TD_RIGHT " </td>\n</tr>\n"; more--; } // while } } // while <nTo return next; } // printLinesAdd() void WikidiffLX::printLinesChange(const LineVector from, const LineVector to, const size_t nFrom, const size_t nTo, size_t * iFrom, size_t * iTo) { // Print pairs 
of blocks of 'change' starting at from[*iFrom]/to[*iTo] int mL; int mR; while (*iFrom < nFrom && *iTo < nTo) { mL = from[*iFrom].get_diffCode(); mR = to[*iTo].get_diffCode(); if (mL == DiffOp<Line>::change && mR == DiffOp<Line>::change) { printLineChangeRow(from, to, nFrom, nTo, iFrom, iTo); } else if (mL == DiffOp<Line>::change) { printLinesDelete(from, nFrom, iFrom); } else if (mR == DiffOp<Line>::change) { printLinesAdd(to, nTo, iTo); } else { break; // while } } // while <nFrom && <nTo while (*iFrom < nFrom) { if (from[*iFrom].get_diffCode() == DiffOp<Line>::change) { printLinesDelete(from, nFrom, iFrom); } else { break; // while } } // while <nFrom while (*iTo < nTo) { if (to[*iTo].get_diffCode() == DiffOp<Line>::change) { printLinesAdd(to, nTo, iTo); } else { break; // while } } // while <nTo } // printLinesChange() void WikidiffLX::printLineChangeRow(const LineVector from, const LineVector to, const size_t nFrom, const size_t nTo, size_t * iFrom, size_t * iTo) { // Print table cells for 'change' // Save worddiff results from left cell for right cell WordsVector wordsTo; WordDiffVector worddiffs; result += TR_LEFT; size_t moreL = printLineChange(from, nFrom, true, iFrom, &wordsTo, &worddiffs); result += TD_RIGHT; size_t moreR = printLineChange(to, nTo, false, iTo, &wordsTo, &worddiffs); result += "</td>\n</tr>\n"; if (moreL + moreR) { const Line *eL = &from[(*iFrom)-1]; const Line *eR = &to[(*iTo)-1]; size_t m; size_t min = std::min(moreL, moreR); for (m = 0; m < min; m++) { result += TR_LEFT; if (eL->equals_trailing(eR, m)) { result += " " TD_RIGHT " "; } else { printSpacediff(eL->get_trailingLength(m)); result += TD_RIGHT; printSpacediff(eR->get_trailingLength(m)); } result += "</td>\n</tr>\n"; } // for m for (m = min; m < moreL; m++) { result += TR_LEFT " </td>\n</tr>\n"; } // for m for (m = min; m < moreR; m++) { result += TD_LEFT TD_RIGHT " </td>\n</tr>\n"; } // for m } } // printLineChangeRow() int WikidiffLX::printLineChange(const LineVector change, 
const size_t n, const bool left, size_t * i, WordsVector * wordsTo, WordDiffVector * worddiffs) { // Print one table cell for 'change' // Return number of trailing lines const Line *e; const Line *f; size_t more = 0; size_t kDiff = 0; result += "<div>"; while (*i < n) { e = &change[*i]; (*i)++; f = e->get_counterPart(); if (f) { if (e->equals_body()) { printText(e->get_body()); if (! e->equals_suffix(f)) { printSpacediff(e->get_suffixLength()); } } else if (left) { WordVector words1; wordsTo->push_back(WordVector()); explodeWords(e->get_bodyBegin(), e->get_bodyEnd(), words1); explodeWords(f->get_bodyBegin(), f->get_bodyEnd(), (*wordsTo)[kDiff]); worddiffs->push_back(WordDiff(words1, (*wordsTo)[kDiff])); printWordDiff((*worddiffs)[kDiff], false); kDiff++; } else { printWordDiff((*worddiffs)[kDiff], true); kDiff++; } } else { // virtual line without counterPart result += SPAN_INLINE; printText(e->get_body()); result += "</span>"; } if (e->is_HardBreak()) { more = e->get_trailingCount(); break; // while } else { result += " "; } } // while true result += "</div>"; return more; } // printLineChange() void WikidiffLX::printSpacediff(const size_t n) { // Print a SPAN with n whitespace replacement characters result += SPAN_INLINE; for (size_t i = 0; i < n; i++) { result += SPACE_DIFF; } // for i result += "</span>"; } // printSpacediff()
printText() [edit]
Based on wikidiff2.
- Use printText() unchanged.
- Added printTextRed(), which works basically like printText() but additionally visualizes zero-width characters within a red (changed) word.
void WikidiffLX::printText(const String & input) { size_t start = 0; size_t end = input.find_first_of("<>&"); while (end != String::npos) { if (end > start) { result.append(input, start, end - start); } switch (input[end]) { case '<': result.append("<"); break; case '>': result.append(">"); break; default /*case '&'*/: result.append("&"); } start = end + 1; end = input.find_first_of("<>&", start); } // Append the rest of the string after the last special character if (start < input.size()) { result.append(input, start, input.size() - start); } } // printText() void WikidiffLX::printTextRed(const String & text) { // HTML source code in UTF-8, general colour red, zero-width chars const Iterator pE = text.end(); Iterator p = text.begin(); size_t i = 0; size_t j = 0; size_t k = 0; wchar_t ch; unsigned char b; while (p < pE) { b = (unsigned char)*p; j++; if (b < 0xC0) { // ASCII or undefined if (b == 0x26 || b == 0x3D) { // & < k = j; j--; } } else { // UTF range ch = nextUtf8Char(p, b, pE); if (ch == 0xAD || ch >= 0x200B && ch <= 0x200F || ch >= 0x202A && ch <= 0x202E) { k = j + (ch < 0x0800 ? 1 : 2); j--; } } if (k) { if (j > i) { result.append(text, i, j - i); } if (b == 0x26) { result.append("&"); } else if (b == 0x3D) { result.append("<"); } else { // UTF result.append(SPACE_DIFF); } i = k; j = k; k = 0; } } // while (p < pE) if (j > i) { result.append(text, i, j - i); } } // printTextRed()
printWordDiff() [edit]
Based on wikidiff2.
- Added printWordDiffBlack()
- Modified printWordDiffSide() to reflect whether both adjacent words are black and shall present whitespace differences.
void WikidiffLX::printWordDiff(WordDiff worddiff, const bool added) { const DiffOp<Word> * op; DiffOp<Word>::PointerVector current; DiffOp<Word>::PointerVector others; String word; const size_t nw = worddiff.size(); const size_t nwb = nw - 1; const size_t nwb2 = (nwb ? nw - 2 : 0); size_t n, j; bool lastBlack; for (size_t i = 0; i < nw; ++i) { op = &worddiff[i]; if (op->op == DiffOp<Word>::copy) { lastBlack = (i < nwb); if (added) { current = op->to; others = op->from; } else { current = op->from; others = op->to; } if (lastBlack) { switch (worddiff[i+1].op) { case DiffOp<Word>::copy : lastBlack = false; break; case DiffOp<Word>::del : if (added) { if (i < nwb2) { lastBlack = (worddiff[i+2].op == DiffOp<Word>::copy); } else { lastBlack = false; } } break; case DiffOp<Word>::add : if (! added) { if (i < nwb2) { lastBlack = (worddiff[i+2].op == DiffOp<Word>::copy); } else { lastBlack = false; } } break; case DiffOp<Word>::change : // skip break; } // switch } printWordDiffBlack(current, others, lastBlack, word); } else if (!added && (op->op == DiffOp<Word>::del || op->op == DiffOp<Word>::change)) { n = op->from.size(); result += SPAN_INLINE; for (j = 0; j < n; j++) { op->from[j]->get_whole(word); printText(word); } result += "</span>"; } else if (added && (op->op == DiffOp<Word>::add || op->op == DiffOp<Word>::change)) { n = op->to.size(); result += SPAN_INLINE; for (j = 0; j < n; j++) { op->to[j]->get_whole(word); printText(word); } result += "</span>"; } } } // printWordDiff() void WikidiffLX::printWordDiffBlack(const DiffOp<Word>::PointerVector current, const DiffOp<Word>::PointerVector others, const bool lastBlack, String & word) { // Print a block of words in black, but possibly with space diffs. // The last black word before a red one is indicated by lastBlack. size_t k; const size_t n = current.size(); const size_t meet = (lastBlack ? 
n-1 : n+1); const Word *item; for (size_t j = 0; j < n; j++) { item = current[j]; if (j == meet) { item->get_whole(word); printText(word); } else { if (item->equals_suffix(others[j])) { item->get_whole(word); printText(word); } else { // space diff k = item->get_suffixlength(); // may be limited to a certain number of spaces if (k == 0) { // last word in line item->get_whole(word); printText(word); } else { item->get_body(word); printText(word); result += SPAN_INLINE; for (size_t i = 0; i < k; i++) { result += SPACE_DIFF; } // for i result += "</span>"; } } } } // for j } // printWordDiffBlack()
recoverParagraphs() [edit]
New methodology for postprocessing of virtual lines.
- Precondition: virtual lines might change op code within paragraph
- Postcondition: every paragraph has a unique op code
void WikidiffLX::recoverParagraphs(LineVector from, LineVector to) { bool learning; bool left = true; diffAdjustSingle(from, DiffOp<Line>::del); diffAdjustSingle(to, DiffOp<Line>::add); diffIncompleteChanges(from); learning = diffIncompleteChanges(to); while (learning) { learning = diffIncompleteChanges((left ? from : to)); left = ! left; } // while learning } // recoverParagraphs() void WikidiffLX::diffAdjustSingle(LineVector vector, const int opMerge) { // Ensure that the entire paragraph containing opMerge is either // * single paragraph opMerge only // * 'change' (if mixed virtual lines) Line *e; size_t k; size_t m = 0; const size_t n = vector.size(); bool leap = false; for (size_t i = 0; i < n; i++) { e = &vector[i]; if (e->is_HardBreak()) { m = i + 1; } if (e->get_diffCode() == opMerge) { leap = false; for (k = m; k < n; k++) { e = &vector[k]; if (e->get_diffCode() != opMerge) { leap = true; } if (e->is_HardBreak()) { break; } } // for k if (k != m) { // virtual lines if (leap) { // / mixed op i = k; for (k = m; k < i; k++) { e = &vector[k]; e->set_diffCopyChange(); } // for k } } } } // for i } // diffAdjustSingle() bool WikidiffLX::diffIncompleteChanges(LineVector vector) { // Unify paragraphs with partial change code const size_t n = vector.size(); Line *e; Line *f; size_t k; size_t m = 0; bool learning = false; for (size_t i = 0; i < n; i++) { e = &vector[i]; if (e->is_HardBreak()) { m = i + 1; } if (e->is_Copy()) { for (k = m; k < n; k++) { e = &vector[k]; f = e->get_counterPart(); if (f->is_Change()) { e->set_diffCopyChange(); learning = true; } if (e->is_HardBreak()) { i = k; break; } } // for k } } // for i return learning; } // diffIncompleteChanges()