skimr

<!DOCTYPE html>
<html lang="en">
<head>
<!--
  Head boilerplate emitted by pandoc (see the "generator" meta tag below).
  The two inline scripts use JavaScript line comments, so their line breaks
  are significant: if a script is collapsed onto a single line, everything
  after the first line comment is commented out. Keep one statement per line.
-->
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Regular expressions</title>
<script>
// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
document.addEventListener('DOMContentLoaded', function(e) {
  var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
  var i, h, a;
  for (i = 0; i < hs.length; i++) {
    h = hs[i];
    // it should be a header h1-h6
    if (!/^h[1-6]$/i.test(h.tagName)) continue;
    // strip every attribute; the list is live, so always remove the first one
    a = h.attributes;
    while (a.length > 0) h.removeAttribute(a[0].name);
  }
});
</script>
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
<style type="text/css">
code { white-space: pre; }
.sourceCode { overflow: visible; }
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
  div.sourceCode { overflow: auto; }
}
@media print {
  pre > code.sourceCode { white-space: pre-wrap; }
  pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code { counter-reset: source-line 0; }
pre.numberSource code > span { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before {
  content: counter(source-line);
  position: relative;
  left: -1em;
  text-align: right;
  vertical-align: baseline;
  border: none;
  display: inline-block;
  -webkit-touch-callout: none;
  -webkit-user-select: none;
  -khtml-user-select: none;
  -moz-user-select: none;
  -ms-user-select: none;
  user-select: none;
  padding: 0 4px;
  width: 4em;
  color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode { }
@media screen {
  pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; }
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.at { color: #7d9029; }
code span.bn { color: #40a070; }
code span.bu { color: #008000; }
code span.cf { color: #007020; font-weight: bold; }
code span.ch { color: #4070a0; }
code span.cn { color: #880000; }
code span.co { color: #60a0b0; font-style: italic; }
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.do { color: #ba2121; font-style: italic; }
code span.dt { color: #902000; }
code span.dv { color: #40a070; }
code span.er { color: #ff0000; font-weight: bold; }
code span.ex { }
code span.fl { color: #40a070; }
code span.fu { color: #06287e; }
code span.im { color: #008000; font-weight: bold; }
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.kw { color: #007020; font-weight: bold; }
code span.op { color: #666666; }
code span.ot { color: #007020; }
code span.pp { color: #bc7a00; }
code span.sc { color: #4070a0; }
code span.ss { color: #bb6688; }
code span.st { color: #4070a0; }
code span.va { color: #19177c; }
code span.vs { color: #4070a0; }
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    // only the sheet tagged data-origin="pandoc" is rewritten
    if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
    // reading cssRules can throw; skip any sheet where it does
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') {
        j++;
        continue;
      }
      // replace div.sourceCode by a pre.sourceCode rule
      // (j is not advanced: the next pass sees the inserted pre.sourceCode
      // rule, which fails the selector test above and moves on)
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>
<style type="text/css">
body {
  background-color: #fff;
  margin: 1em auto;
  max-width: 700px;
  overflow: visible;
  padding-left: 2em;
  padding-right: 2em;
  font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
  font-size: 14px;
  line-height: 1.35;
}
#TOC {
  clear: both;
  margin: 0 0 10px 10px;
  padding: 4px;
  width: 400px;
  border: 1px solid #CCCCCC;
  border-radius: 5px;
  background-color: #f6f6f6;
  font-size: 13px;
  line-height: 1.3;
}
#TOC .toctitle { font-weight: bold; font-size: 15px; margin-left: 5px; }
#TOC ul { padding-left: 40px; margin-left: -1.5em; margin-top: 5px; margin-bottom: 5px; }
#TOC ul ul { margin-left: -2em; }
#TOC li { line-height: 16px; }
table {
  margin: 1em auto;
  border-width: 1px;
  border-color: #DDDDDD;
  border-style: outset;
  border-collapse: collapse;
}
table th { border-width: 2px; padding: 5px; border-style: inset; }
table td { border-width: 1px; border-style: inset; line-height: 18px; padding: 5px 5px; }
table, table th, table td { border-left-style: none; border-right-style: none; }
table thead, table tr.even { background-color: #f7f7f7; }
p { margin: 0.5em 0; }
blockquote { background-color: #f6f6f6; padding: 0.25em 0.75em; }
hr { border-style: solid; border: none; border-top: 1px solid #777; margin: 28px 0; }
dl { margin-left: 0; }
dl dd { margin-bottom: 13px; margin-left: 13px; }
dl dt { font-weight: bold; }
ul { margin-top: 0; }
ul li { list-style: circle outside; }
ul ul { margin-bottom: 0; }
pre, code { background-color: #f7f7f7; border-radius: 3px; color: #333; white-space: pre-wrap; }
pre { border-radius: 3px; margin: 5px 0px 10px 0px; padding: 10px; }
pre:not([class]) { background-color: #f7f7f7; }
code { font-family: Consolas, Monaco, 'Courier New', monospace; font-size: 85%; }
p > code, li > code { padding: 2px 0px; }
div.figure { text-align: center; }
img {
  background-color: #FFFFFF;
  padding: 2px;
  border: 1px solid #DDDDDD;
  border-radius: 3px;
  border: 1px solid #CCCCCC;
  margin: 0 5px;
}
h1 { margin-top: 0; font-size: 35px; line-height: 40px; }
h2 { border-bottom: 4px solid #f7f7f7; padding-top: 10px; padding-bottom: 2px; font-size: 145%; }
h3 { border-bottom: 2px solid #f7f7f7; padding-top: 10px; font-size: 120%; }
h4 { border-bottom: 1px solid #f7f7f7; margin-left: 8px; font-size: 105%; }
h5, h6 { border-bottom: 1px solid #ccc; font-size: 105%; }
a { color: #0033dd; text-decoration: none; }
a:hover { color: #6666ff; }
a:visited { color: #800080; }
a:visited:hover { color: #BB00BB; }
a[href^="http:"] { text-decoration: underline; }
a[href^="https:"] { text-decoration: underline; }
code > span.kw { color: #555; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn { color: #d14; }
code > span.fl { color: #d14; }
code > span.ch { color: #d14; }
code > span.st { color: #d14; }
code > span.co { color: #888888; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #900; font-weight: bold; }
code > span.er { color: #a61717; background-color: #e3d2d2; }
</style>
</head>
<body>
<h1 class="title toc-ignore">Regular expressions</h1>
Regular expressions are a concise and flexible tool for describing patterns in strings. This vignette describes the key features of stringr’s regular expressions, as implemented by <a href="https://github.com/gagolews/stringi">stringi</a>.
It is not a tutorial, so if you’re unfamiliar with regular expressions, I’d recommend starting at <a href="https://r4ds.had.co.nz/strings.html" class="uri">https://r4ds.had.co.nz/strings.html</a>. If you want to master the details, I’d recommend reading the classic <a href="https://www.amazon.com/Mastering-Regular-Expressions-Jeffrey-Friedl/dp/0596528124">Mastering Regular Expressions</a> by Jeffrey E. F. Friedl. Regular expressions are the default pattern engine in stringr. That means when you use a pattern matching function with a bare string, it’s equivalent to wrapping it in a call to <code>regex()</code>: <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a># The regular call: <a href="#cb1-2" aria-hidden="true" tabindex="-1"></a>str_extract(fruit, "nana") <a href="#cb1-3" aria-hidden="true" tabindex="-1"></a># Is shorthand for <a href="#cb1-4" aria-hidden="true" tabindex="-1"></a>str_extract(fruit, regex("nana"))</code></pre></div> You will need to use <code>regex()</code> explicitly if you want to override the default options, as you’ll see in examples below. 
<div id="basic-matches" class="section level2"> <h2>Basic matches</h2> The simplest patterns match exact strings: <div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>x <- c("apple", "banana", "pear") <a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>str_extract(x, "an") <a href="#cb2-3" aria-hidden="true" tabindex="-1"></a>#> [1] NA "an" NA</code></pre></div> You can perform a case-insensitive match using <code>ignore_case = TRUE</code>: <div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>bananas <- c("banana", "Banana", "BANANA") <a href="#cb3-2" aria-hidden="true" tabindex="-1"></a>str_detect(bananas, "banana") <a href="#cb3-3" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE FALSE FALSE <a href="#cb3-4" aria-hidden="true" tabindex="-1"></a>str_detect(bananas, regex("banana", ignore_case = TRUE)) <a href="#cb3-5" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE TRUE TRUE</code></pre></div> The next step up in complexity is <code>.</code>, which matches any character except a newline: <div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>str_extract(x, ".a.") <a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>#> [1] NA "ban" "ear"</code></pre></div> You can allow <code>.</code> to match everything, including <code>\n</code>, by setting <code>dotall = TRUE</code>: <div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>str_detect("\nX\n", ".X.") <a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>#> [1] FALSE <a href="#cb5-3" aria-hidden="true" tabindex="-1"></a>str_detect("\nX\n", regex(".X.", dotall = TRUE)) <a href="#cb5-4" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE</code></pre></div> </div> <div 
id="escaping" class="section level2"> <h2>Escaping</h2> If “<code>.</code>” matches any character, how do you match a literal “<code>.</code>”? You need to use an “escape” to tell the regular expression you want to match it exactly, not use its special behaviour. Like strings, regexps use the backslash, <code>\</code>, to escape special behaviour. So to match an <code>.</code>, you need the regexp <code>\.</code>. Unfortunately this creates a problem. We use strings to represent regular expressions, and <code>\</code> is also used as an escape symbol in strings. So to create the regular expression <code>\.</code> we need the string <code>"\\."</code>. <div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a># To create the regular expression, we need \\ <a href="#cb6-2" aria-hidden="true" tabindex="-1"></a>dot <- "\\." <a href="#cb6-3" aria-hidden="true" tabindex="-1"></a> <a href="#cb6-4" aria-hidden="true" tabindex="-1"></a># But the expression itself only contains one: <a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>writeLines(dot) <a href="#cb6-6" aria-hidden="true" tabindex="-1"></a>#> \. <a href="#cb6-7" aria-hidden="true" tabindex="-1"></a> <a href="#cb6-8" aria-hidden="true" tabindex="-1"></a># And this tells R to look for an explicit . <a href="#cb6-9" aria-hidden="true" tabindex="-1"></a>str_extract(c("abc", "a.c", "bef"), "a\\.c") <a href="#cb6-10" aria-hidden="true" tabindex="-1"></a>#> [1] NA "a.c" NA</code></pre></div> If <code>\</code> is used as an escape character in regular expressions, how do you match a literal <code>\</code>? Well you need to escape it, creating the regular expression <code>\\</code>. To create that regular expression, you need to use a string, which also needs to escape <code>\</code>. That means to match a literal <code>\</code> you need to write <code>"\\\\"</code> — you need four backslashes to match one! 
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>x <- "a\\b" <a href="#cb7-2" aria-hidden="true" tabindex="-1"></a>writeLines(x) <a href="#cb7-3" aria-hidden="true" tabindex="-1"></a>#> a\b <a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <a href="#cb7-5" aria-hidden="true" tabindex="-1"></a>str_extract(x, "\\\\") <a href="#cb7-6" aria-hidden="true" tabindex="-1"></a>#> [1] "\\"</code></pre></div> In this vignette, I use <code>\.</code> to denote the regular expression, and <code>"\\."</code> to denote the string that represents the regular expression. An alternative quoting mechanism is <code>\Q...\E</code>: all the characters in <code>...</code> are treated as exact matches. This is useful if you want to exactly match user input as part of a regular expression. <div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>x <- c("a.b.c.d", "aeb") <a href="#cb8-2" aria-hidden="true" tabindex="-1"></a>starts_with <- "a.b" <a href="#cb8-3" aria-hidden="true" tabindex="-1"></a> <a href="#cb8-4" aria-hidden="true" tabindex="-1"></a>str_detect(x, paste0("^", starts_with)) <a href="#cb8-5" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE TRUE <a href="#cb8-6" aria-hidden="true" tabindex="-1"></a>str_detect(x, paste0("^\\Q", starts_with, "\\E")) <a href="#cb8-7" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE FALSE</code></pre></div> </div> <div id="special-characters" class="section level2"> <h2>Special characters</h2> Escapes also allow you to specify individual characters that are otherwise hard to type. 
You can specify individual unicode characters in five ways, either as a variable number of hex digits (four is most common), or by name: <ul> <li><code>\xhh</code>: 2 hex digits.</li> <li><code>\x{hhhh}</code>: 1-6 hex digits.</li> <li><code>\uhhhh</code>: 4 hex digits.</li> <li><code>\Uhhhhhhhh</code>: 8 hex digits.</li> <li><code>\N{name}</code>, e.g. <code>\N{grinning face}</code> matches the basic smiling emoji.</li> </ul> Similarly, you can specify many common control characters: <ul> <li><code>\a</code>: bell.</li> <li><code>\cX</code>: match a control-X character.</li> <li><code>\e</code>: escape (<code>\u001B</code>).</li> <li><code>\f</code>: form feed (<code>\u000C</code>).</li> <li><code>\n</code>: line feed (<code>\u000A</code>).</li> <li><code>\r</code>: carriage return (<code>\u000D</code>).</li> <li><code>\t</code>: horizontal tabulation (<code>\u0009</code>).</li> <li><code>\0ooo</code> match an octal character. ‘ooo’ is from one to three octal digits, from 000 to 0377. The leading zero is required.</li> </ul> (Many of these are only of historical interest and are only included here for the sake of completeness.) </div> <div id="matching-multiple-characters" class="section level2"> <h2>Matching multiple characters</h2> There are a number of patterns that match more than one character. You’ve already seen <code>.</code>, which matches any character (except a newline). A closely related operator is <code>\X</code>, which matches a grapheme cluster, a set of individual elements that form a single symbol. 
For example, one way of representing “á” is as the letter “a” plus an accent: <code>.</code> will match the component “a”, while <code>\X</code> will match the complete symbol: <div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>x <- "a\u0301" <a href="#cb9-2" aria-hidden="true" tabindex="-1"></a>str_extract(x, ".") <a href="#cb9-3" aria-hidden="true" tabindex="-1"></a>#> [1] "a" <a href="#cb9-4" aria-hidden="true" tabindex="-1"></a>str_extract(x, "\\X") <a href="#cb9-5" aria-hidden="true" tabindex="-1"></a>#> [1] "á"</code></pre></div> There are five other escaped pairs that match narrower classes of characters: <ul> <li><code>\d</code>: matches any digit. The complement, <code>\D</code>, matches any character that is not a decimal digit. <div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a>str_extract_all("1 + 2 = 3", "\\d+")[[1]] <a href="#cb10-2" aria-hidden="true" tabindex="-1"></a>#> [1] "1" "2" "3"</code></pre></div> Technically, <code>\d</code> includes any character in the Unicode Category of Nd (“Number, Decimal Digit”), which also includes numeric symbols from other languages: <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a># Some Khmer numbers <a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>str_detect("១២៣", "\\d") <a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE</code></pre></div></li> <li><code>\s</code>: matches any whitespace. This includes tabs, newlines, form feeds, and any character in the Unicode Z Category (which includes a variety of space characters and other separators). The complement, <code>\S</code>, matches any non-whitespace character. 
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>(text <- "Some \t badly\n\t\tspaced \f text") <a href="#cb12-2" aria-hidden="true" tabindex="-1"></a>#> [1] "Some \t badly\n\t\tspaced \f text" <a href="#cb12-3" aria-hidden="true" tabindex="-1"></a>str_replace_all(text, "\\s+", " ") <a href="#cb12-4" aria-hidden="true" tabindex="-1"></a>#> [1] "Some badly spaced text"</code></pre></div></li> <li><code>\p{property name}</code> matches any character with specific unicode property, like <code>\p{Uppercase}</code> or <code>\p{Diacritic}</code>. The complement, <code>\P{property name}</code>, matches all characters without the property. A complete list of unicode properties can be found at <a href="http://www.unicode.org/reports/tr44/#Property_Index" class="uri">http://www.unicode.org/reports/tr44/#Property_Index</a>. <div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>(text <- c('"Double quotes"', "«Guillemet»", "“Fancy quotes”")) <a href="#cb13-2" aria-hidden="true" tabindex="-1"></a>#> [1] "\"Double quotes\"" "«Guillemet»" "“Fancy quotes”" <a href="#cb13-3" aria-hidden="true" tabindex="-1"></a>str_replace_all(text, "\\p{quotation mark}", "'") <a href="#cb13-4" aria-hidden="true" tabindex="-1"></a>#> [1] "'Double quotes'" "'Guillemet'" "'Fancy quotes'"</code></pre></div></li> <li><code>\w</code> matches any “word” character, which includes alphabetic characters, marks and decimal numbers. The complement, <code>\W</code>, matches any non-word character. 
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a>str_extract_all("Don't eat that!", "\\w+")[[1]] <a href="#cb14-2" aria-hidden="true" tabindex="-1"></a>#> [1] "Don" "t" "eat" "that" <a href="#cb14-3" aria-hidden="true" tabindex="-1"></a>str_split("Don't eat that!", "\\W")[[1]] <a href="#cb14-4" aria-hidden="true" tabindex="-1"></a>#> [1] "Don" "t" "eat" "that" ""</code></pre></div> Technically, <code>\w</code> also matches connector punctuation, <code>\u200c</code> (zero width non-joiner), and <code>\u200d</code> (zero width joiner), but these are rarely seen in the wild.</li> <li><code>\b</code> matches word boundaries, the transition between word and non-word characters. <code>\B</code> matches the opposite: boundaries that have either both word or non-word characters on either side. <div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>str_replace_all("The quick brown fox", "\\b", "_") <a href="#cb15-2" aria-hidden="true" tabindex="-1"></a>#> [1] "_The_ _quick_ _brown_ _fox_" <a href="#cb15-3" aria-hidden="true" tabindex="-1"></a>str_replace_all("The quick brown fox", "\\B", "_") <a href="#cb15-4" aria-hidden="true" tabindex="-1"></a>#> [1] "T_h_e q_u_i_c_k b_r_o_w_n f_o_x"</code></pre></div></li> </ul> You can also create your own character classes using <code>[]</code>: <ul> <li><code>[abc]</code>: matches a, b, or c.</li> <li><code>[a-z]</code>: matches every character between a and z (in Unicode code point order).</li> <li><code>[^abc]</code>: matches anything except a, b, or c.</li> <li><code>[\^\-]</code>: matches <code>^</code> or <code>-</code>.</li> </ul> There are a number of pre-built classes that you can use inside <code>[]</code>: <ul> <li><code>[:punct:]</code>: punctuation.</li> <li><code>[:alpha:]</code>: letters.</li> <li><code>[:lower:]</code>: lowercase letters.</li> 
<li><code>[:upper:]</code>: uppercase letters.</li> <li><code>[:digit:]</code>: digits.</li> <li><code>[:xdigit:]</code>: hex digits.</li> <li><code>[:alnum:]</code>: letters and numbers.</li> <li><code>[:cntrl:]</code>: control characters.</li> <li><code>[:graph:]</code>: letters, numbers, and punctuation.</li> <li><code>[:print:]</code>: letters, numbers, punctuation, and whitespace.</li> <li><code>[:space:]</code>: space characters (basically equivalent to <code>\s</code>).</li> <li><code>[:blank:]</code>: space and tab.</li> </ul> These all go inside the <code>[]</code> for character classes, i.e. <code>[[:digit:]AX]</code> matches all digits, A, and X. You can also use Unicode properties, like <code>[\p{Letter}]</code>, and various set operations, like <code>[\p{Letter}--\p{script=latin}]</code>. See <code>?"stringi-search-charclass"</code> for details. </div> <div id="alternation" class="section level2"> <h2>Alternation</h2> <code>|</code> is the alternation operator, which will pick between one or more possible matches. For example, <code>abc|def</code> will match <code>abc</code> or <code>def</code>: <div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>str_detect(c("abc", "def", "ghi"), "abc|def") <a href="#cb16-2" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE TRUE FALSE</code></pre></div> Note that the precedence for <code>|</code> is low: <code>abc|def</code> is equivalent to <code>(abc)|(def)</code> not <code>ab(c|d)ef</code>. 
</div> <div id="grouping" class="section level2"> <h2>Grouping</h2> You can use parentheses to override the default precedence rules: <div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a>str_extract(c("grey", "gray"), "gre|ay") <a href="#cb17-2" aria-hidden="true" tabindex="-1"></a>#> [1] "gre" "ay" <a href="#cb17-3" aria-hidden="true" tabindex="-1"></a>str_extract(c("grey", "gray"), "gr(e|a)y") <a href="#cb17-4" aria-hidden="true" tabindex="-1"></a>#> [1] "grey" "gray"</code></pre></div> Parentheses also define “groups” that you can refer to with backreferences, like <code>\1</code>, <code>\2</code> etc, and can be extracted with <code>str_match()</code>. For example, the following regular expression finds all fruits that have a repeated pair of letters: <div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>pattern <- "(..)\\1" <a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>fruit %>% <a href="#cb18-3" aria-hidden="true" tabindex="-1"></a> str_subset(pattern) <a href="#cb18-4" aria-hidden="true" tabindex="-1"></a>#> [1] "banana" <a href="#cb18-5" aria-hidden="true" tabindex="-1"></a> <a href="#cb18-6" aria-hidden="true" tabindex="-1"></a>fruit %>% <a href="#cb18-7" aria-hidden="true" tabindex="-1"></a> str_subset(pattern) %>% <a href="#cb18-8" aria-hidden="true" tabindex="-1"></a> str_match(pattern) <a href="#cb18-9" aria-hidden="true" tabindex="-1"></a>#> [,1] [,2] <a href="#cb18-10" aria-hidden="true" tabindex="-1"></a>#> [1,] "anan" "an"</code></pre></div> You can use <code>(?:...)</code>, the non-grouping parentheses, to control precedence but not capture the match in a group. This is slightly more efficient than capturing parentheses. 
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>str_match(c("grey", "gray"), "gr(e|a)y") <a href="#cb19-2" aria-hidden="true" tabindex="-1"></a>#> [,1] [,2] <a href="#cb19-3" aria-hidden="true" tabindex="-1"></a>#> [1,] "grey" "e" <a href="#cb19-4" aria-hidden="true" tabindex="-1"></a>#> [2,] "gray" "a" <a href="#cb19-5" aria-hidden="true" tabindex="-1"></a>str_match(c("grey", "gray"), "gr(?:e|a)y") <a href="#cb19-6" aria-hidden="true" tabindex="-1"></a>#> [,1] <a href="#cb19-7" aria-hidden="true" tabindex="-1"></a>#> [1,] "grey" <a href="#cb19-8" aria-hidden="true" tabindex="-1"></a>#> [2,] "gray"</code></pre></div> This is most useful for more complex cases where you need to capture matches and control precedence independently. </div> <div id="anchors" class="section level2"> <h2>Anchors</h2> By default, regular expressions will match any part of a string. It’s often useful to anchor the regular expression so that it matches from the start or end of the string: <ul> <li><code>^</code> matches the start of string.</li> <li><code>$</code> matches the end of the string.</li> </ul> <div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>x <- c("apple", "banana", "pear") <a href="#cb20-2" aria-hidden="true" tabindex="-1"></a>str_extract(x, "^a") <a href="#cb20-3" aria-hidden="true" tabindex="-1"></a>#> [1] "a" NA NA <a href="#cb20-4" aria-hidden="true" tabindex="-1"></a>str_extract(x, "a$") <a href="#cb20-5" aria-hidden="true" tabindex="-1"></a>#> [1] NA "a" NA</code></pre></div> To match a literal “$” or “^”, you need to escape them, <code>\$</code>, and <code>\^</code>. For multiline strings, you can use <code>regex(multiline = TRUE)</code>. 
This changes the behaviour of <code>^</code> and <code>$</code>, and introduces three new operators: <ul> <li><code>^</code> now matches the start of each line.</li> <li><code>$</code> now matches the end of each line.</li> <li><code>\A</code> matches the start of the input.</li> <li><code>\z</code> matches the end of the input.</li> <li><code>\Z</code> matches the end of the input, but before the final line terminator, if it exists.</li> </ul> <div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>x <- "Line 1\nLine 2\nLine 3\n" <a href="#cb21-2" aria-hidden="true" tabindex="-1"></a>str_extract_all(x, "^Line..")[[1]] <a href="#cb21-3" aria-hidden="true" tabindex="-1"></a>#> [1] "Line 1" <a href="#cb21-4" aria-hidden="true" tabindex="-1"></a>str_extract_all(x, regex("^Line..", multiline = TRUE))[[1]] <a href="#cb21-5" aria-hidden="true" tabindex="-1"></a>#> [1] "Line 1" "Line 2" "Line 3" <a href="#cb21-6" aria-hidden="true" tabindex="-1"></a>str_extract_all(x, regex("\\ALine..", multiline = TRUE))[[1]] <a href="#cb21-7" aria-hidden="true" tabindex="-1"></a>#> [1] "Line 1"</code></pre></div> </div> <div id="repetition" class="section level2"> <h2>Repetition</h2> You can control how many times a pattern matches with the repetition operators: <ul> <li><code>?</code>: 0 or 1.</li> <li><code>+</code>: 1 or more.</li> <li><code>*</code>: 0 or more.</li> </ul> <div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a>x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII" <a href="#cb22-2" aria-hidden="true" tabindex="-1"></a>str_extract(x, "CC?") <a href="#cb22-3" aria-hidden="true" tabindex="-1"></a>#> [1] "CC" <a href="#cb22-4" aria-hidden="true" tabindex="-1"></a>str_extract(x, "CC+") <a href="#cb22-5" aria-hidden="true" tabindex="-1"></a>#> [1] "CCC" <a href="#cb22-6" aria-hidden="true" 
tabindex="-1"></a>str_extract(x, 'C[LX]+') <a href="#cb22-7" aria-hidden="true" tabindex="-1"></a>#> [1] "CLXXX"</code></pre></div> Note that the precedence of these operators is high, so you can write: <code>colou?r</code> to match either American or British spellings. That means most uses will need parentheses, like <code>bana(na)+</code>. You can also specify the number of matches precisely: <ul> <li><code>{n}</code>: exactly n</li> <li><code>{n,}</code>: n or more</li> <li><code>{n,m}</code>: between n and m</li> </ul> <div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a>str_extract(x, "C{2}") <a href="#cb23-2" aria-hidden="true" tabindex="-1"></a>#> [1] "CC" <a href="#cb23-3" aria-hidden="true" tabindex="-1"></a>str_extract(x, "C{2,}") <a href="#cb23-4" aria-hidden="true" tabindex="-1"></a>#> [1] "CCC" <a href="#cb23-5" aria-hidden="true" tabindex="-1"></a>str_extract(x, "C{2,3}") <a href="#cb23-6" aria-hidden="true" tabindex="-1"></a>#> [1] "CCC"</code></pre></div> By default these matches are “greedy”: they will match the longest string possible. 
You can make them “lazy”, matching the shortest string possible by putting a <code>?</code> after them: <ul> <li><code>??</code>: 0 or 1, prefer 0.</li> <li><code>+?</code>: 1 or more, match as few times as possible.</li> <li><code>*?</code>: 0 or more, match as few times as possible.</li> <li><code>{n,}?</code>: n or more, match as few times as possible.</li> <li><code>{n,m}?</code>: between n and m, match as few times as possible, but at least n.</li> </ul> <div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a>str_extract(x, c("C{2,3}", "C{2,3}?")) <a href="#cb24-2" aria-hidden="true" tabindex="-1"></a>#> [1] "CCC" "CC" <a href="#cb24-3" aria-hidden="true" tabindex="-1"></a>str_extract(x, c("C[LX]+", "C[LX]+?")) <a href="#cb24-4" aria-hidden="true" tabindex="-1"></a>#> [1] "CLXXX" "CL"</code></pre></div> You can also make the matches possessive by putting a <code>+</code> after them, which means that if later parts of the match fail, the repetition will not be re-tried with a smaller number of characters. This is an advanced feature used to improve performance in worst-case scenarios (called “catastrophic backtracking”). <ul> <li><code>?+</code>: 0 or 1, possessive.</li> <li><code>++</code>: 1 or more, possessive.</li> <li><code>*+</code>: 0 or more, possessive.</li> <li><code>{n}+</code>: exactly n, possessive.</li> <li><code>{n,}+</code>: n or more, possessive.</li> <li><code>{n,m}+</code>: between n and m, possessive.</li> </ul> A related concept is the atomic-match parenthesis, <code>(?>...)</code>. If a later match fails and the engine needs to back-track, an atomic match is kept as is: it succeeds or fails as a whole. 
Compare the following two regular expressions: <div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a>str_detect("ABC", "(?>A|.B)C") <a href="#cb25-2" aria-hidden="true" tabindex="-1"></a>#> [1] FALSE <a href="#cb25-3" aria-hidden="true" tabindex="-1"></a>str_detect("ABC", "(?:A|.B)C") <a href="#cb25-4" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE</code></pre></div> The atomic match fails because it matches A, and then the next character is a C so it fails. The regular match succeeds because it matches A, but then C doesn’t match, so it back-tracks and tries B instead. </div> <div id="look-arounds" class="section level2"> <h2>Look arounds</h2> These assertions look ahead or behind the current match without “consuming” any characters (i.e. changing the input position). <ul> <li><code>(?=...)</code>: positive look-ahead assertion. Matches if <code>...</code> matches at the current input.</li> <li><code>(?!...)</code>: negative look-ahead assertion. Matches if <code>...</code> does not match at the current input.</li> <li><code>(?&lt;=...)</code>: positive look-behind assertion. Matches if <code>...</code> matches text preceding the current position, with the last character of the match being the character just before the current position. Length must be bounded (i.e. no <code>*</code> or <code>+</code>).</li> <li><code>(?&lt;!...)</code>: negative look-behind assertion. Matches if <code>...</code> does not match text preceding the current position. Length must be bounded (i.e. 
no <code>*</code> or <code>+</code>).</li> </ul> These are useful when you want to check that a pattern exists, but you don’t want to include it in the result: <div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a>x <- c("1 piece", "2 pieces", "3") <a href="#cb26-2" aria-hidden="true" tabindex="-1"></a>str_extract(x, "\\d+(?= pieces?)") <a href="#cb26-3" aria-hidden="true" tabindex="-1"></a>#> [1] "1" "2" NA <a href="#cb26-4" aria-hidden="true" tabindex="-1"></a> <a href="#cb26-5" aria-hidden="true" tabindex="-1"></a>y <- c("100", "$400") <a href="#cb26-6" aria-hidden="true" tabindex="-1"></a>str_extract(y, "(?<=\\$)\\d+") <a href="#cb26-7" aria-hidden="true" tabindex="-1"></a>#> [1] NA "400"</code></pre></div> </div> <div id="comments" class="section level2"> <h2>Comments</h2> There are two ways to include comments in a regular expression. The first is with <code>(?#...)</code>: <div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a>str_detect("xyz", "x(?#this is a comment)") <a href="#cb27-2" aria-hidden="true" tabindex="-1"></a>#> [1] TRUE</code></pre></div> The second is to use <code>regex(comments = TRUE)</code>. This form ignores spaces and newlines, and everything after <code>#</code>. To match a literal space, you’ll need to escape it: <code>"\\ "</code>. This is a useful way of describing complex regular expressions: <div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a>phone <- regex(" <a href="#cb28-2" aria-hidden="true" tabindex="-1"></a> \\(? # optional opening parens <a href="#cb28-3" aria-hidden="true" tabindex="-1"></a> (\\d{3}) # area code <a href="#cb28-4" aria-hidden="true" tabindex="-1"></a> \\)?