UNPKG

diff

Version:

A JavaScript text diff implementation.

kpdecker/jsdiff

516 lines (515 loc) • 23.8 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.parsePatch = parsePatch; /** * Parses a unified diff format patch into a structured patch object. * * `parsePatch` has some understanding of Git's particular dialect of unified diff format. * When parsing a Git patch, each index in the result may contain additional * fields (`isRename`, `isBinary`, etc) not included in the data structure returned by * `structuredPatch`; see the `StructuredPatch` interface for a full list. * * @return a JSON object representation of the patch, suitable for use with the `applyPatch` * method. This parses to the same structure returned by `structuredPatch`, except that * `oldFileName` and `newFileName` may be `undefined` if the patch doesn't contain enough * information to determine them (e.g. a hunk-only patch with no file headers). */ function parsePatch(uniDiff) { const diffstr = uniDiff.split(/\n/), list = []; let i = 0; // These helper functions identify line types that can appear between files // in a multi-file patch. Keeping them in one place avoids subtle // inconsistencies from having the same regexes duplicated in multiple places. // Matches `diff --git ...` lines specifically. function isGitDiffHeader(line) { return (/^diff --git /).test(line); } // Matches lines that denote the start of a new diff's section in a // multi-file patch: `diff --git ...`, `Index: ...`, or `diff -r ...`. function isDiffHeader(line) { return isGitDiffHeader(line) || (/^Index:\s/).test(line) || (/^diff(?: -r \w+)+\s/).test(line); } // Matches `--- ...` and `+++ ...` file header lines. function isFileHeader(line) { return (/^(---|\+\+\+)\s/).test(line); } // Matches `@@ ...` hunk header lines. function isHunkHeader(line) { return (/^@@\s/).test(line); } function parseIndex() { var _a; const index = {}; index.hunks = []; list.push(index); // Parse diff metadata let seenDiffHeader = false; while (i < diffstr.length) { const line = diffstr[i]; // File header (---, +++) or hunk header (@@) found; end parsing diff metadata if (isFileHeader(line) || isHunkHeader(line)) { break; } // The next two branches handle recognized diff headers. Note that // isDiffHeader deliberately does NOT match arbitrary `diff` // commands like `diff -u -p -r1.1 -r1.2`, because in some // formats (e.g. CVS diffs) such lines appear as metadata within // a single file's header section, after an `Index:` line. See the // diffx documentation (https://diffx.org) for examples. // // In both branches: if we've already seen a diff header for *this* // file and now we encounter another one, it must belong to the // next file, so break. if (isGitDiffHeader(line)) { if (seenDiffHeader) { return; } seenDiffHeader = true; index.isGit = true; // Parse the old and new filenames from the `diff --git` header and // tentatively set oldFileName and newFileName from them. These may // be overridden below by `rename from` / `rename to` or `copy from` / // `copy to` extended headers, or by --- and +++ lines. But for Git // diffs that lack all of those (e.g. mode-only changes, binary // file changes without rename), these are the only filenames we // get. // parseGitDiffHeader returns null if the header can't be parsed // (e.g. unterminated quoted filename, or unexpected format). In // that case we skip setting filenames here; they may still be // set from --- / +++ or rename from / rename to lines below. const paths = parseGitDiffHeader(line); if (paths) { index.oldFileName = paths.oldFileName; index.newFileName = paths.newFileName; } // Consume Git extended headers (`old mode`, `new mode`, `rename from`, // `rename to`, `similarity index`, `index`, `Binary files ... differ`, // etc.) i++; while (i < diffstr.length) { const extLine = diffstr[i]; // Stop consuming extended headers if we hit a file header, // hunk header, or another diff header. if (isFileHeader(extLine) || isHunkHeader(extLine) || isDiffHeader(extLine)) { break; } // Parse `rename from` / `rename to` lines - these give us // unambiguous filenames. These lines don't include the // a/ and b/ prefixes that appear in the `diff --git` header // and --- / +++ lines, so we add them for consistency. // Git C-style quotes filenames containing special characters // (tabs, newlines, backslashes, double quotes), so we must // unquote them when present. const renameFromMatch = (/^rename from (.*)/).exec(extLine); if (renameFromMatch) { index.oldFileName = 'a/' + unquoteIfQuoted(renameFromMatch[1]); index.isRename = true; } const renameToMatch = (/^rename to (.*)/).exec(extLine); if (renameToMatch) { index.newFileName = 'b/' + unquoteIfQuoted(renameToMatch[1]); index.isRename = true; } // Parse copy from / copy to lines similarly const copyFromMatch = (/^copy from (.*)/).exec(extLine); if (copyFromMatch) { index.oldFileName = 'a/' + unquoteIfQuoted(copyFromMatch[1]); index.isCopy = true; } const copyToMatch = (/^copy to (.*)/).exec(extLine); if (copyToMatch) { index.newFileName = 'b/' + unquoteIfQuoted(copyToMatch[1]); index.isCopy = true; } const newFileModeMatch = (/^new file mode (\d+)/).exec(extLine); if (newFileModeMatch) { index.isCreate = true; index.newMode = newFileModeMatch[1]; } const deletedFileModeMatch = (/^deleted file mode (\d+)/).exec(extLine); if (deletedFileModeMatch) { index.isDelete = true; index.oldMode = deletedFileModeMatch[1]; } const oldModeMatch = (/^old mode (\d+)/).exec(extLine); if (oldModeMatch) { index.oldMode = oldModeMatch[1]; } const newModeMatch = (/^new mode (\d+)/).exec(extLine); if (newModeMatch) { index.newMode = newModeMatch[1]; } if ((/^Binary files /).test(extLine)) { index.isBinary = true; } i++; } continue; } else if (isDiffHeader(line)) { if (seenDiffHeader) { return; } seenDiffHeader = true; // For Mercurial-style headers like // diff -r 9117c6561b0b -r 273ce12ad8f1 .hgignore // or Index: headers like // Index: something with multiple words // we extract the trailing filename as the index. // // TODO: It seems awkward that we indiscriminately trim off // trailing whitespace here. Theoretically, couldn't that // be meaningful - e.g. if the patch represents a diff of a // file whose name ends with a space? Seems wrong to nuke // it. But this behaviour has been around since v2.2.1 in // 2015, so if it's going to change, it should be done // cautiously and in a new major release, for // backwards-compat reasons. // -- ExplodingCabbage const headerMatch = (/^(?:Index:|diff(?: -r \w+)+)\s+/).exec(line); if (headerMatch) { index.index = line.substring(headerMatch[0].length).trim(); } } i++; } // Parse file headers if they are defined. Unified diff requires them, but // there's no technical issues to have an isolated hunk without file header parseFileHeader(index); parseFileHeader(index); // If we got one file header but not the other, that's a malformed patch. if ((index.oldFileName === undefined) !== (index.newFileName === undefined)) { throw new Error('Missing ' + (index.oldFileName !== undefined ? '"+++ ..."' : '"--- ..."') + ' file header for ' + ((_a = index.oldFileName) !== null && _a !== void 0 ? _a : index.newFileName)); } while (i < diffstr.length) { const line = diffstr[i]; if (isDiffHeader(line) || isFileHeader(line) || (/^===================================================================/).test(line)) { break; } else if (isHunkHeader(line)) { index.hunks.push(parseHunk()); } else { // Skip blank lines and any other unrecognized content between // or after hunks. Real-world examples of such content include: // - `Only in <dir>: <file>` from GNU `diff -r` // - `Property changes on:` sections from `svn diff` // - Trailing prose or commentary in email patches // GNU `patch` tolerates all of these, and so do we. i++; } } } /** * Parses the old and new filenames from a `diff --git` header line. * * The format is: * diff --git a/<old-path> b/<new-path> * * When filenames contain special characters (including newlines, tabs, * backslashes, or double quotes), Git quotes them with C-style escaping: * diff --git "a/file\twith\ttabs.txt" "b/file\twith\ttabs.txt" * * When filenames don't contain special characters and the old and new names * are the same, we can unambiguously split on ` b/` by finding where the * two halves (including their a/ and b/ prefixes) yield matching bare names. * * A pathological case exists in which we cannot reliably determine the paths * from the `diff --git` header. This case is when the following are true: * - the old and new file paths differ * - they are both unquoted (i.e. contain no special characters) * - at least one of the underlying file paths includes the substring ` b/` * In this scenario, we do not know which occurrence of ` b/` indicates the * start of the new file path, so the header is inherently ambiguous. We thus * select a possible interpretation arbitrarily and return that. * * Fortunately, this ambiguity should never matter, because in any patch * genuinely output by Git in which this pathological scenario occurs, there * must also be `rename from`/`rename to` or `copy from`/`copy to` extended * headers present below the `diff --git` header. `parseIndex` will parse * THOSE headers, from which we CAN unambiguously determine the filenames, * and will discard the result returned by this function. * * Returns null if the header can't be parsed at all — e.g. a quoted filename * has an unterminated quote, or if the unquoted header doesn't match the * expected `a/... b/...` format. In that case, the caller (parseIndex) * skips setting oldFileName/newFileName from this header, but they may * still be set later from `---`/`+++` lines or `rename from`/`rename to` * extended headers; if none of those are present either, they'll remain * undefined in the output. */ function parseGitDiffHeader(line) { // Strip the "diff --git " prefix const rest = line.substring('diff --git '.length); // Handle quoted paths: "a/path" "b/path" // Git quotes paths when they contain characters like newlines, tabs, // backslashes, or double quotes (but notably not spaces). if (rest.startsWith('"')) { const oldPath = parseQuotedFileName(rest); if (oldPath === null) { return null; } const afterOld = rest.substring(oldPath.rawLength + 1); // +1 for space let newFileName; if (afterOld.startsWith('"')) { const newPath = parseQuotedFileName(afterOld); if (newPath === null) { return null; } newFileName = newPath.fileName; } else { newFileName = afterOld; } return { oldFileName: oldPath.fileName, newFileName }; } // Check if the second path is quoted // e.g. diff --git a/simple "b/renamed\nnewline.txt" const quoteIdx = rest.indexOf('"'); if (quoteIdx > 0) { const oldFileName = rest.substring(0, quoteIdx - 1); const newPath = parseQuotedFileName(rest.substring(quoteIdx)); if (newPath === null) { return null; } return { oldFileName, newFileName: newPath.fileName }; } // Unquoted paths. Try to find the split point. // The format is: a/<old-path> b/<new-path> // // Note the potential ambiguity caused by the possibility of the file paths // themselves containing the substring ` b/`, plus the pathological case // described in the comment above. // // Strategy: find all occurrences of " b/" and split on the middle // one. When old and new names are the same (which is the only case where // we can't rely on extended headers later in the patch so HAVE to get // this right), this will always be the correct split. if (rest.startsWith('a/')) { const splits = []; let idx = 0; while (true) { idx = rest.indexOf(' b/', idx + 1); if (idx === -1) { break; } splits.push(idx); } if (splits.length > 0) { const mid = splits[Math.floor(splits.length / 2)]; return { oldFileName: rest.substring(0, mid), newFileName: rest.substring(mid + 1) }; } } // Fallback: can't parse, return null return null; } /** * If `s` starts with a double quote, unquotes it using C-style escape * rules (as used by Git). Otherwise returns `s` as-is. */ function unquoteIfQuoted(s) { if (s.startsWith('"')) { const parsed = parseQuotedFileName(s); if (parsed) { return parsed.fileName; } } return s; } /** * Parses a C-style quoted filename as used by Git or GNU `diff -u`. * Returns the unescaped filename and the raw length consumed (including quotes). */ function parseQuotedFileName(s) { if (!s.startsWith('"')) { return null; } let result = ''; let j = 1; // skip opening quote while (j < s.length) { if (s[j] === '"') { return { fileName: result, rawLength: j + 1 }; } if (s[j] === '\\' && j + 1 < s.length) { j++; switch (s[j]) { case 'a': result += '\x07'; break; case 'b': result += '\b'; break; case 'f': result += '\f'; break; case 'n': result += '\n'; break; case 'r': result += '\r'; break; case 't': result += '\t'; break; case 'v': result += '\v'; break; case '\\': result += '\\'; break; case '"': result += '"'; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // C-style octal escapes represent raw bytes. Collect // consecutive octal-escaped bytes and decode as UTF-8. // Validate that we have a full 3-digit octal escape if (j + 2 >= s.length || s[j + 1] < '0' || s[j + 1] > '7' || s[j + 2] < '0' || s[j + 2] > '7') { return null; } const bytes = [parseInt(s.substring(j, j + 3), 8)]; j += 3; while (s[j] === '\\' && s[j + 1] >= '0' && s[j + 1] <= '7') { if (j + 3 >= s.length || s[j + 2] < '0' || s[j + 2] > '7' || s[j + 3] < '0' || s[j + 3] > '7') { return null; } bytes.push(parseInt(s.substring(j + 1, j + 4), 8)); j += 4; } result += new TextDecoder('utf-8').decode(new Uint8Array(bytes)); continue; // j already points at the next character } // Note that in C, there are also three kinds of hex escape sequences: // - \xhh // - \uhhhh // - \Uhhhhhhhh // We do not bother to parse them here because, so far as we know, // they are never emitted by any tools that generate unified diff // format diffs, and so for now jsdiff does not consider them legal. default: return null; } } else { result += s[j]; } j++; } // Unterminated quote return null; } // Parses the --- and +++ headers, if none are found, no lines // are consumed. function parseFileHeader(index) { const fileHeaderMatch = (/^(---|\+\+\+)\s+/).exec(diffstr[i]); if (fileHeaderMatch) { const prefix = fileHeaderMatch[1], data = diffstr[i].substring(3).trim().split('\t', 2), header = (data[1] || '').trim(); let fileName = data[0]; if (fileName.startsWith('"')) { fileName = unquoteIfQuoted(fileName); } else { fileName = fileName.replace(/\\\\/g, '\\'); } if (prefix === '---') { index.oldFileName = fileName; index.oldHeader = header; } else { index.newFileName = fileName; index.newHeader = header; } i++; } } // Parses a hunk // This assumes that we are at the start of a hunk. function parseHunk() { var _a; const chunkHeaderIndex = i, chunkHeaderLine = diffstr[i++], chunkHeader = chunkHeaderLine.split(/@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/); const hunk = { oldStart: +chunkHeader[1], oldLines: typeof chunkHeader[2] === 'undefined' ? 1 : +chunkHeader[2], newStart: +chunkHeader[3], newLines: typeof chunkHeader[4] === 'undefined' ? 1 : +chunkHeader[4], lines: [] }; // Unified Diff Format quirk: If the chunk size is 0, // the first number is one lower than one would expect. // https://www.artima.com/weblogs/viewpost.jsp?thread=164293 if (hunk.oldLines === 0) { hunk.oldStart += 1; } if (hunk.newLines === 0) { hunk.newStart += 1; } let addCount = 0, removeCount = 0; for (; i < diffstr.length && (removeCount < hunk.oldLines || addCount < hunk.newLines || ((_a = diffstr[i]) === null || _a === void 0 ? void 0 : _a.startsWith('\\'))); i++) { const operation = (diffstr[i].length == 0 && i != (diffstr.length - 1)) ? ' ' : diffstr[i][0]; if (operation === '+' || operation === '-' || operation === ' ' || operation === '\\') { hunk.lines.push(diffstr[i]); if (operation === '+') { addCount++; } else if (operation === '-') { removeCount++; } else if (operation === ' ') { addCount++; removeCount++; } } else { throw new Error(`Hunk at line ${chunkHeaderIndex + 1} contained invalid line ${diffstr[i]}`); } } // Handle the empty block count case if (!addCount && hunk.newLines === 1) { hunk.newLines = 0; } if (!removeCount && hunk.oldLines === 1) { hunk.oldLines = 0; } // Perform sanity checking if (addCount !== hunk.newLines) { throw new Error('Added line count did not match for hunk at line ' + (chunkHeaderIndex + 1)); } if (removeCount !== hunk.oldLines) { throw new Error('Removed line count did not match for hunk at line ' + (chunkHeaderIndex + 1)); } // Check for extra hunk-body-like lines after the declared line counts // were exhausted. If the very next line starts with ' ', '+', or '-', // the hunk's line counts were probably wrong — unless it's a file // header (--- or +++), which legitimately appears immediately after a // hunk in multi-file diffs without Index lines. if (i < diffstr.length && diffstr[i] && (/^[+ -]/).test(diffstr[i]) && !isFileHeader(diffstr[i])) { throw new Error('Hunk at line ' + (chunkHeaderIndex + 1) + ' has more lines than expected (expected ' + hunk.oldLines + ' old lines and ' + hunk.newLines + ' new lines)'); } return hunk; } while (i < diffstr.length) { parseIndex(); } return list; }