UNPKG

curlconverter

Version:

convert curl commands to Python, JavaScript, Go, PHP and more

737 lines (703 loc) 22.6 kB
import { Word, Token, firstShellToken } from "./Word.js";
import { CCError } from "../utils.js";
import { clip } from "../parse.js";
import parser from "./Parser.js";
import type { Parser } from "./Parser.js";
import { underlineNode, underlineCursor, type Warnings } from "../Warnings.js";

// Matches a backslash followed by any single character (including a newline).
const BACKSLASHES = /\\./gs;

/**
 * Replacement callback for a two-character backslash escape.
 *
 * @param m - the matched text: a backslash followed by one character
 * @returns the empty string for an escaped newline (a line continuation),
 *   otherwise the character that followed the backslash
 */
function removeBackslash(m: string) {
  return m.charAt(1) === "\n" ? "" : m.charAt(1);
}

/** Unescapes every backslash-escaped character in unquoted shell text. */
function removeBackslashes(str: string): string {
  return str.replace(BACKSLASHES, removeBackslash);
}

// https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
// Only \, $, `, " and newline are matched here; a backslash before any
// other character is left in place.
const DOUBLE_QUOTE_BACKSLASHES = /\\[\\$`"\n]/gs;

/** Unescapes the backslash sequences matched by DOUBLE_QUOTE_BACKSLASHES. */
function removeDoubleQuoteBackslashes(str: string): string {
  return str.replace(DOUBLE_QUOTE_BACKSLASHES, removeBackslash);
}

// ANSI-C quoted strings look $'like this'.
// Not all shells have them but Bash does
// https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html
//
// https://git.savannah.gnu.org/cgit/bash.git/tree/lib/sh/strtrans.c
const ANSI_BACKSLASHES =
  /\\(\\|a|b|e|E|f|n|r|t|v|'|"|\?|[0-7]{1,3}|x[0-9A-Fa-f]{1,2}|u[0-9A-Fa-f]{1,4}|U[0-9A-Fa-f]{1,8}|c.)/gs;

/**
 * Decodes the escape sequences inside an ANSI-C quoted string ($'...').
 *
 * @param str - the string contents, without the leading `$'` and trailing `'`
 * @returns the string with every sequence matched by ANSI_BACKSLASHES replaced
 *   by the character it denotes; other text is left as-is
 * @throws CCError for a `\c` control escape applied to a non-ASCII character
 * @throws RangeError (from String.fromCodePoint) for a hexadecimal escape
 *   above U+10FFFF
 */
function removeAnsiCBackslashes(str: string): string {
  function unescapeChar(m: string) {
    switch (m.charAt(1)) {
      case "\\":
        return "\\";
      case "a":
        return "\x07";
      case "b":
        return "\b";
      case "e":
      case "E":
        return "\x1B";
      case "f":
        return "\f";
      case "n":
        return "\n";
      case "r":
        return "\r";
      case "t":
        return "\t";
      case "v":
        return "\v";
      case "'":
        return "'";
      case '"':
        return '"';
      case "?":
        return "?";
      case "c":
        // Bash handles all characters by considering the first byte
        // of its UTF-8 input and can produce invalid UTF-8, whereas
        // JavaScript stores strings in UTF-16
        if (m.codePointAt(2)! > 127) {
          throw new CCError(
            'non-ASCII control character in ANSI-C quoted string: "\\u{' +
              m.codePointAt(2)!.toString(16) +
              '}"',
          );
        }
        // If this produces a 0x00 (null) character, it will cause bash to
        // terminate the string at that character, but we return the null
        // character in the result.
        return m[2] === "?"
          ? "\x7F"
          : String.fromCodePoint(
              m[2].toUpperCase().codePointAt(0)! & 0b00011111,
            );
      case "x":
      case "u":
      case "U":
        // Hexadecimal character literal
        // Unlike bash, this will error if the code point is greater than 10FFFF
        return String.fromCodePoint(parseInt(m.slice(2), 16));
      case "0":
      case "1":
      case "2":
      case "3":
      case "4":
      case "5":
      case "6":
      case "7":
        // Octal character literal
        return String.fromCodePoint(parseInt(m.slice(1), 8) % 256);
      default:
        // There must be a mis-match between ANSI_BACKSLASHES and the switch statement
        throw new CCError(
          "unhandled character in ANSI-C escape code: " + JSON.stringify(m),
        );
    }
  }

  return str.replace(ANSI_BACKSLASHES, unescapeChar);
}

/**
 * Converts one argument-level syntax node into a list of tokens.
 *
 * Literal text becomes plain strings (with the quoting style's escapes
 * removed); `$var`, `${var}` and command substitutions become token objects
 * of type "variable" or "command" and each pushes an "expansion" warning.
 *
 * @param node - the syntax node to convert
 * @param curlCommand - the full input text, used to underline nodes in messages
 * @param warnings - list that warnings are appended to
 * @returns the node's tokens; always at least one element
 * @throws CCError if the node type is not one of the supported types
 */
function toTokens(
  node: Parser.SyntaxNode,
  curlCommand: string,
  warnings: Warnings,
): Token[] {
  let vals: Token[] = [];
  switch (node.type) {
    case "$":
      // TODO: https://github.com/tree-sitter/tree-sitter-bash/issues/258
      return ["$"];
    case "word":
    case "number":
      // TODO: number might have a ${variable}
      return [removeBackslashes(node.text)];
    case "raw_string":
      // Strip the surrounding single quotes
      return [node.text.slice(1, -1)];
    case "ansi_c_string":
      // Strip the leading $' and the trailing '
      return [removeAnsiCBackslashes(node.text.slice(2, -1))];
    case "string":
    case "translated_string": {
      // TODO: MISSING quotes, for example
      // curl "example.com
      let res = "";
      for (const child of node.namedChildren) {
        if (child.type === "string_content") {
          res += removeDoubleQuoteBackslashes(child.text);
        } else {
          // expansion, simple_expansion or command_substitution (or concat?)
          // toTokens() always returns an array, so flush the literal text
          // collected so far and then append the child's tokens.
          const subVal = toTokens(child, curlCommand, warnings);
          if (res) {
            vals.push(res);
            res = "";
          }
          vals = vals.concat(subVal);
        }
      }
      // An empty string ("") must still produce one (empty) token
      if (res || vals.length === 0) {
        vals.push(res);
      }
      return vals;
    }
    case "simple_expansion":
      // TODO: handle variables downstream
      // '$' + variable_name or special_variable_name
      warnings.push([
        "expansion",
        "found environment variable\n" + underlineNode(node, curlCommand),
      ]);
      if (
        node.firstNamedChild &&
        node.firstNamedChild.type === "special_variable_name"
      ) {
        // https://www.gnu.org/software/bash/manual/bash.html#Special-Parameters
        // TODO: warning isn't printed
        warnings.push([
          "special_variable_name",
          node.text +
            " is a special Bash variable\n" +
            underlineNode(node.firstNamedChild, curlCommand),
        ]);
      }
      return [
        {
          type: "variable",
          value: node.text.slice(1),
          text: node.text,
          syntaxNode: node,
        },
      ];
    case "expansion":
      // Expansions look ${like_this}
      // https://www.gnu.org/software/bash/manual/bash.html#Shell-Parameter-Expansion
      // TODO: MISSING }, for example
      // curl example${com
      warnings.push([
        "expansion",
        "found expansion expression\n" + underlineNode(node, curlCommand),
      ]);
      // variable_name or subscript or no child
      // TODO: handle substitutions
      return [
        {
          type: "variable",
          value: node.text.slice(2, -1),
          text: node.text,
          syntaxNode: node,
        },
      ];
    case "command_substitution":
      // TODO: MISSING ), for example
      // curl example$(com
      warnings.push([
        "expansion",
        "found command substitution expression\n" +
          underlineNode(node, curlCommand),
      ]);
      return [
        {
          type: "command",
          // TODO: further tokenize and pass an array of args
          // to subprocess.run() or a command name + string args to C#
          // Strip "$(" (2 characters) or a 1-character opener, and the closer
          value: node.text.slice(node.text.startsWith("$(") ? 2 : 1, -1),
          text: node.text,
          syntaxNode: node,
        },
      ];
    case "concatenation": {
      // item[]=1 turns into item=1 if we don't do this
      // https://github.com/tree-sitter/tree-sitter-bash/issues/104
      let prevEnd = 0;
      let res = "";
      for (const child of node.children) {
        // TODO: removeBackslashes()?
        // Can we get anything other than []{} characters here?
        // Copy the source text that lies between the previous child and this one
        res += node.text.slice(prevEnd, child.startIndex - node.startIndex);
        prevEnd = child.endIndex - node.startIndex;

        // toTokens() always returns an array, so flush the literal text
        // collected so far and then append the child's tokens.
        const subVal = toTokens(child, curlCommand, warnings);
        if (res) {
          vals.push(res);
          res = "";
        }
        vals = vals.concat(subVal);
      }
      // Text after the last child
      res += node.text.slice(prevEnd);
      if (res || vals.length === 0) {
        vals.push(res);
      }
      return vals;
    }
    default:
      throw new CCError(
        "unexpected syntax node type " +
          JSON.stringify(node.type) +
          '. Must be one of "word", "number", "string", "raw_string", "ansi_c_string", "expansion", "simple_expansion", "translated_string", "command_substitution" or "concatenation"\n' +
          underlineNode(node, curlCommand),
      );
  }
}

/** Builds a Word from the tokens of a single syntax node (see toTokens). */
function toWord(
  node: Parser.SyntaxNode,
  curlCommand: string,
  warnings: Warnings,
): Word {
  return new Word(toTokens(node, curlCommand, warnings));
}

/**
 * Walks the whole tree depth-first and yields the cursor each time it is
 * positioned on an "ERROR" node or a node flagged as missing.
 *
 * The same cursor object is yielded every time and keeps moving, so callers
 * must read what they need from it before resuming the generator.
 */
function* traverseLookingForBadNodes(
  tree: Parser.Tree,
): Generator<Parser.TreeCursor> {
  const cursor = tree.walk();

  let reachedRoot = false;
  while (!reachedRoot) {
    if (cursor.nodeType === "ERROR" || cursor.nodeIsMissing) {
      yield cursor;
    }

    // Descend first, then move sideways
    if (cursor.gotoFirstChild()) {
      continue;
    }
    if (cursor.gotoNextSibling()) {
      continue;
    }

    // No child and no sibling: climb until an ancestor has a next sibling,
    // or until there is no parent left (back at the root, traversal done).
    let retracing = true;
    while (retracing) {
      if (!cursor.gotoParent()) {
        retracing = false;
        reachedRoot = true;
      }
      if (cursor.gotoNextSibling()) {
        retracing = false;
      }
    }
  }
}

/**
 * Pushes a "bash" warning for each of the first few ERROR/MISSING nodes in
 * the tree, plus one summary warning counting the ones that were not shown.
 *
 * @param ast - the parsed tree
 * @param curlCommand - the full input text, used to underline the bad nodes
 * @param warnings - list that warnings are appended to
 */
function warnAboutBadNodes(
  ast: Parser.Tree,
  curlCommand: string,
  warnings: Warnings,
) {
  const maxShown = 3;
  let count = 0;
  for (const badNode of traverseLookingForBadNodes(ast)) {
    if (count < maxShown) {
      let underlinedNode = "";
      try {
        underlinedNode = ":\n" + underlineCursor(badNode, curlCommand);
      } catch {
        // Best effort: if underlining fails, report the position only
      }
      const line = badNode.startPosition.row;
      const column = badNode.startPosition.column;
      warnings.push([
        "bash",
        `Bash parsing error on line ${line + 1}` +
          (column !== 0 ? `, column ${column + 1}` : "") +
          underlinedNode,
      ]);
    }
    count += 1;
  }
  const extra = count - maxShown;
  if (extra > 0) {
    warnings.push([
      "bash",
      `${extra} more Bash parsing error${extra > 1 ? "s" : ""} omitted`,
    ]);
  }
}

/**
 * Warns when the line on which a node ends finishes with a backslash followed
 * by whitespace and another line follows it: such a backslash escapes the
 * whitespace character rather than the newline.
 *
 * @param n - the command-level node to check
 * @param curlCommandLines - the input text split on "\n"
 * @param warnings - list that the warning is appended to
 */
function warnAboutUselessBackslash(
  n: Parser.SyntaxNode,
  curlCommandLines: string[],
  warnings: Warnings,
) {
  const lastCommandLine = curlCommandLines[n.endPosition.row];
  const improperBackslash = lastCommandLine.match(/\\\s+$/);
  if (
    improperBackslash &&
    curlCommandLines.length > n.endPosition.row + 1 &&
    improperBackslash.index !== undefined
  ) {
    warnings.push([
      "unescaped-newline",
      "The trailling '\\' on line " +
        (n.endPosition.row + 1) +
        " is followed by whitespace, so it won't escape the newline after it:\n" +
        // TODO: cut off line if it's very long?
        lastCommandLine +
        "\n" +
        " ".repeat(improperBackslash.index) +
        "^".repeat(improperBackslash[0].length),
    ]);
  }
}

/**
 * Splits a "redirected_statement" node into its command and the stdin that
 * its first redirect provides.
 *
 * @param node - a "redirected_statement" syntax node
 * @param curlCommand - the full input text, used to underline nodes in messages
 * @param warnings - list that warnings are appended to
 * @returns `[command, stdin, stdinFile]`: `stdinFile` is set for a
 *   "file_redirect" (its destination), `stdin` for a "heredoc_redirect" (the
 *   raw heredoc body text) or a "herestring_redirect"; the other is undefined
 * @throws CCError if the node has no command, no redirect, or a redirect of
 *   an unsupported type or shape
 */
function extractRedirect(
  node: Parser.SyntaxNode,
  curlCommand: string,
  warnings: Warnings,
): [Parser.SyntaxNode, Word?, Word?] {
  if (!node.namedChildCount) {
    throw new CCError('got empty "redirected_statement" AST node');
  }

  let stdin: Word | undefined;
  let stdinFile: Word | undefined;
  const [command, ...redirects] = node.namedChildren;
  if (command.type !== "command") {
    throw new CCError(
      'got "redirected_statement" AST node whose first child is not a "command", got ' +
        command.type +
        " instead\n" +
        underlineNode(command, curlCommand),
    );
  }
  // Also check the named redirects so that redirects[0] below is defined
  if (node.childCount < 2 || !redirects.length) {
    throw new CCError(
      'got "redirected_statement" AST node with only one child - no redirect',
    );
  }
  if (redirects.length > 1) {
    warnings.push([
      "multiple-redirects",
      // TODO: this is misleading because not all generators support redirects
      "found " +
        redirects.length +
        " redirect nodes. Only the first one will be used:\n" +
        underlineNode(redirects[1], curlCommand),
    ]);
  }

  const redirect = redirects[0];
  if (redirect.type === "file_redirect") {
    const destination = redirect.childForFieldName("destination");
    if (!destination) {
      throw new CCError(
        'got "file_redirect" AST node with no "destination" child',
      );
    }
    stdinFile = toWord(destination, curlCommand, warnings);
  } else if (redirect.type === "heredoc_redirect") {
    const heredocBody = redirect.descendantsOfType("heredoc_body")[0];
    if (!heredocBody) {
      throw new CCError(
        'got "redirected_statement" AST node without heredoc_body',
      );
    }
    // TODO: heredocs can do variable expansion and stuff
    stdin = new Word(heredocBody.text);
  } else if (redirect.type === "herestring_redirect") {
    // TODO: herestring_redirect can be in a command node
    // https://github.com/tree-sitter/tree-sitter-bash/issues/260
    if (redirect.namedChildCount < 1 || !redirect.firstNamedChild) {
      throw new CCError(
        'got "redirected_statement" AST node with empty herestring',
      );
    }
    stdin = toWord(redirect.firstNamedChild, curlCommand, warnings);
  } else {
    // Report the type of the offending redirect node, not of the command
    throw new CCError(
      'got "redirected_statement" AST node whose second child is not one of "file_redirect", "heredoc_redirect" or "herestring_redirect", got ' +
        redirect.type +
        " instead",
    );
  }

  return [command, stdin, stdinFile];
}

/**
 * Searches a "pipeline" node (recursing into nested pipelines) for the first
 * command whose name is "curl".
 *
 * Every later curl command found in the pipeline pushes a
 * "multiple-curl-in-pipeline" warning and is otherwise ignored.
 *
 * @param node - a "pipeline" syntax node
 * @param curlCommand - the full input text, used to underline nodes in messages
 * @param warnings - list that warnings are appended to
 * @returns `[command, stdin, stdinFile]`, with `command` undefined when the
 *   pipeline contains no curl command
 * @throws CCError if a "command" child does not start with a "command_name"
 *   holding a "word", or if extractRedirect() rejects a redirected statement
 */
function _findCurlInPipeline(
  node: Parser.SyntaxNode,
  curlCommand: string,
  warnings: Warnings,
): [Parser.SyntaxNode?, Word?, Word?] {
  let command: Parser.SyntaxNode | undefined;
  let stdin: Word | undefined;
  let stdinFile: Word | undefined;
  for (const child of node.namedChildren) {
    if (child.type === "command") {
      const commandName = child.namedChildren[0];
      if (commandName.type !== "command_name") {
        throw new CCError(
          'got "command" AST node whose first child is not a "command_name", got ' +
            commandName.type +
            " instead\n" +
            underlineNode(commandName, curlCommand),
        );
      }
      const commandNameWord = commandName.namedChildren[0];
      if (commandNameWord.type !== "word") {
        throw new CCError(
          'got "command_name" AST node whose first child is not a "word", got ' +
            commandNameWord.type +
            " instead\n" +
            underlineNode(commandNameWord, curlCommand),
        );
      }
      if (commandNameWord.text === "curl") {
        if (!command) {
          command = child;
        } else {
          warnings.push([
            "multiple-curl-in-pipeline",
            "found multiple curl commands in pipeline:\n" +
              underlineNode(child, curlCommand),
          ]);
        }
      }
    } else if (child.type === "redirected_statement") {
      const [redirCommand, redirStdin, redirStdinFile] = extractRedirect(
        child,
        curlCommand,
        warnings,
      );
      if (redirCommand.namedChildren[0].text === "curl") {
        if (!command) {
          [command, stdin, stdinFile] = [
            redirCommand,
            redirStdin,
            redirStdinFile,
          ];
        } else {
          warnings.push([
            "multiple-curl-in-pipeline",
            "found multiple curl commands in pipeline:\n" +
              underlineNode(redirCommand, curlCommand),
          ]);
        }
      }
    } else if (child.type === "pipeline") {
      // pipelines can be nested
      // https://github.com/tree-sitter/tree-sitter-bash/issues/167
      const [nestedCommand, nestedStdin, nestedStdinFile] =
        _findCurlInPipeline(child, curlCommand, warnings);
      if (!nestedCommand) {
        continue;
      }
      if (nestedCommand.namedChildren[0].text === "curl") {
        if (!command) {
          [command, stdin, stdinFile] = [
            nestedCommand,
            nestedStdin,
            nestedStdinFile,
          ];
        } else {
          warnings.push([
            "multiple-curl-in-pipeline",
            "found multiple curl commands in pipeline:\n" +
              underlineNode(nestedCommand, curlCommand),
          ]);
        }
      }
    }
  }

  return [command, stdin, stdinFile];
}

// TODO: use pipeline input/output redirects,
// i.e. add stdinCommand and stdout/stdoutFile/stdoutCommand
/**
 * Like _findCurlInPipeline(), but a pipeline without a curl command is an
 * error.
 *
 * @throws CCError if no curl command is found in the pipeline
 */
function findCurlInPipeline(
  node: Parser.SyntaxNode,
  curlCommand: string,
  warnings: Warnings,
): [Parser.SyntaxNode, Word?, Word?] {
  const [command, stdin, stdinFile] = _findCurlInPipeline(
    node,
    curlCommand,
    warnings,
  );
  if (!command) {
    throw new CCError(
      "could not find curl command in pipeline\n" +
        underlineNode(node, curlCommand),
    );
  }
  return [command, stdin, stdinFile];
}

// TODO: check entire AST for ERROR/MISSING nodes
// TODO: get all command nodes
/**
 * Collects the top-level commands of the parsed program.
 *
 * "command", "redirected_statement" and "pipeline" nodes each contribute one
 * entry; "comment" and "heredoc_body" nodes are skipped; anything else is an
 * error.
 *
 * @param ast - the parsed tree
 * @param curlCommand - the full input text, used to underline nodes in messages
 * @param warnings - list that warnings are appended to
 * @returns a non-empty list of `[command, stdin, stdinFile]` tuples
 * @throws CCError if the root is not a "program", the program is empty, a
 *   top-level node is an ERROR or of an unsupported type, or no command is
 *   found
 */
function extractCommandNodes(
  ast: Parser.Tree,
  curlCommand: string,
  warnings: Warnings,
): [Parser.SyntaxNode, Word?, Word?][] {
  // https://github.com/tree-sitter/tree-sitter-bash/blob/master/grammar.js
  // The AST must be in a nice format, i.e.
  // (program
  //   (command
  //     name: (command_name (word))
  //     argument+: (
  //       word |
  //       "string" |
  //       'raw_string' |
  //       $'ansi_c_string' |
  //       $"translated_string" |
  //       ${expansion} |
  //       $simple_expansion |
  //       concatenation)))
  // or
  // (program
  //   (redirected_statement
  //     body: (command, same as above)
  //     redirect))

  // Shouldn't happen.
  if (ast.rootNode.type !== "program") {
    // TODO: better error message.
    throw new CCError(
      // TODO: expand "AST" acronym the first time it appears in an error message
      'expected a "program" top-level AST node, got ' +
        ast.rootNode.type +
        " instead",
    );
  }

  if (ast.rootNode.namedChildCount < 1 || !ast.rootNode.namedChildren) {
    // TODO: better error message.
    throw new CCError('empty "program" node');
  }

  const curlCommandLines = curlCommand.split("\n");
  let sawComment = false;
  const commands: [Parser.SyntaxNode, Word?, Word?][] = [];
  // Get top-level command and redirected_statement AST nodes, skipping comments
  for (const n of ast.rootNode.namedChildren) {
    switch (n.type) {
      case "comment":
        sawComment = true;
        continue;
      case "command":
        commands.push([n, undefined, undefined]);
        warnAboutUselessBackslash(n, curlCommandLines, warnings);
        break;
      case "redirected_statement":
        commands.push(extractRedirect(n, curlCommand, warnings));
        warnAboutUselessBackslash(n, curlCommandLines, warnings);
        break;
      case "pipeline":
        commands.push(findCurlInPipeline(n, curlCommand, warnings));
        warnAboutUselessBackslash(n, curlCommandLines, warnings);
        break;
      case "heredoc_body": // https://github.com/tree-sitter/tree-sitter-bash/issues/118
        continue;
      case "ERROR":
        throw new CCError(
          `Bash parsing error on line ${n.startPosition.row + 1}:\n` +
            underlineNode(n, curlCommand),
        );
      default:
        // TODO: better error message.
        throw new CCError(
          "found " +
            JSON.stringify(n.type) +
            ' AST node, only "command", "pipeline" or "redirected_statement" are supported\n' +
            underlineNode(n, curlCommand),
        );
    }
  }
  if (!commands.length) {
    // NOTE: if you add more node types in the `for` loop above, this error needs to be updated.
    // We would probably need to keep track of the node types we've seen.
    throw new CCError(
      'expected a "command" or "redirected_statement" AST node' +
        (sawComment ? ', only found "comment" nodes' : ""),
    );
  }
  return commands;
}

/**
 * Returns the "name" field node and the "argument" field nodes of a
 * "command" node.
 *
 * @param command - a "command" syntax node
 * @param curlCommand - the full input text, used to underline nodes in messages
 * @param warnings - currently unused
 * @throws CCError if the command has no children or no "name" field
 */
function toNameAndArgv(
  command: Parser.SyntaxNode,
  curlCommand: string,
  warnings: Warnings,
): [Parser.SyntaxNode, Parser.SyntaxNode[]] {
  if (command.childCount < 1) {
    // TODO: better error message.
    throw new CCError(
      'empty "command" node\n' + underlineNode(command, curlCommand),
    );
  }

  // TODO: warn about leading variable_assignment s
  const name = command.childForFieldName("name");
  const args = command.childrenForFieldName("argument");
  // TODO: find if one changes stdin
  // const redirects = command.childrenForFieldName("redirect");

  if (!name) {
    throw new CCError(
      'found "command" AST node with no "command_name" child\n' +
        underlineNode(command, curlCommand),
    );
  }

  return [name, args];
}

/**
 * Converts a "command_name" node to a Word and checks that name is "curl".
 *
 * A name containing a shell token (variable or command substitution) is
 * accepted only when the whole word stringifies to "$curl"; a plain name must
 * equal "curl" after trimming whitespace.
 *
 * @param name - a "command_name" syntax node
 * @param curlCommand - the full input text, used to underline nodes in messages
 * @param warnings - list that warnings are appended to
 * @returns the command name as a Word
 * @throws CCError if the node is empty or the name is not acceptable
 */
function nameToWord(
  name: Parser.SyntaxNode,
  curlCommand: string,
  warnings: Warnings,
): Word {
  if (name.childCount < 1 || !name.firstChild) {
    throw new CCError(
      'found empty "command_name" AST node\n' +
        underlineNode(name, curlCommand),
    );
  } else if (name.childCount > 1) {
    warnings.push([
      "extra-command_name-children",
      'expected "command_name" node to only have one child but it has ' +
        name.childCount,
    ]);
  }

  const nameNode = name.firstChild;
  const nameWord = toWord(nameNode, curlCommand, warnings);
  const nameWordStr = nameWord.toString();
  const cmdNameShellToken = firstShellToken(nameWord);
  if (cmdNameShellToken) {
    // The most common reason for the command name to contain an expression
    // is probably users accidentally copying a $ from the shell prompt
    // without a space after it
    if (nameWordStr !== "$curl") {
      // TODO: or just assume it evaluates to "curl"?
      throw new CCError(
        "expected command name to be a simple value but found a " +
          cmdNameShellToken.type +
          "\n" +
          underlineNode(cmdNameShellToken.syntaxNode, curlCommand),
      );
    }
  } else if (nameWordStr.trim() !== "curl") {
    const c = nameWordStr.trim();
    if (!c) {
      throw new CCError(
        "found command without a command_name\n" +
          underlineNode(nameNode, curlCommand),
      );
    }
    throw new CCError(
      'command should begin with "curl" but instead begins with ' +
        JSON.stringify(clip(c)) +
        "\n" +
        underlineNode(nameNode, curlCommand),
    );
  }
  return nameWord;
}

/**
 * Parses a string containing one or more curl commands as Bash and returns
 * each command as a list of Words.
 *
 * @param curlCommand - the shell text to tokenize
 * @param warnings - list that warnings are appended to (a fresh list by
 *   default, which the caller then cannot inspect)
 * @returns one `[argv, stdin, stdinFile]` tuple per command; `argv[0]` is the
 *   command name and `stdin`/`stdinFile` come from a redirect, if any
 * @throws CCError if the input cannot be interpreted as curl commands
 */
export function tokenize(
  curlCommand: string,
  warnings: Warnings = [],
): [Word[], Word?, Word?][] {
  const ast = parser.parse(curlCommand);
  warnAboutBadNodes(ast, curlCommand, warnings);

  // TODO: pass syntax nodes for each token downstream and use it to
  // highlight the problematic parts in warnings/errors so that it's clear
  // which command a warning/error is for
  // TODO: could even go further and pass all start/end indexes of all
  // values all the way through the conversion so when you mouse over a part of
  // the generated Python code we could highlight the part of the curl command
  // that it came from.
  const commandNodes = extractCommandNodes(ast, curlCommand, warnings);
  const commands: [Word[], Word?, Word?][] = [];
  for (const [command, stdin, stdinFile] of commandNodes) {
    const [name, argv] = toNameAndArgv(command, curlCommand, warnings);
    commands.push([
      [
        nameToWord(name, curlCommand, warnings),
        ...argv.map((arg) => toWord(arg, curlCommand, warnings)),
      ],
      stdin,
      stdinFile,
    ]);
  }
  return commands;
}