skimr
Version:
CLI EDA for CSVs
773 lines (744 loc) • 88.4 kB
HTML
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="author" content="Sara Stoudt" />
<title>From base R</title>
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
// Once the DOM is parsed, strip every attribute from the heading that opens
// each pandoc section div, so the attributes live only on the wrapping div.
document.addEventListener('DOMContentLoaded', function(e) {
  var headingTag = /^h[1-6]$/i;
  var leadElements = document.querySelectorAll("div.section[class*='level'] > :first-child");
  Array.prototype.forEach.call(leadElements, function(node) {
    // Anything that is not an h1-h6 heading is left untouched.
    if (!headingTag.test(node.tagName)) return;
    var attrs = node.attributes;
    // Keep removing whichever attribute is currently first until none remain.
    while (attrs.length > 0) {
      node.removeAttribute(attrs[0].name);
    }
  });
});
</script>
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
<style type="text/css">
code {
white-space: pre;
}
.sourceCode {
overflow: visible;
}
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; }
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.at { color: #7d9029; }
code span.bn { color: #40a070; }
code span.bu { color: #008000; }
code span.cf { color: #007020; font-weight: bold; }
code span.ch { color: #4070a0; }
code span.cn { color: #880000; }
code span.co { color: #60a0b0; font-style: italic; }
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.do { color: #ba2121; font-style: italic; }
code span.dt { color: #902000; }
code span.dv { color: #40a070; }
code span.er { color: #ff0000; font-weight: bold; }
code span.ex { }
code span.fl { color: #40a070; }
code span.fu { color: #06287e; }
code span.im { color: #008000; font-weight: bold; }
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.kw { color: #007020; font-weight: bold; }
code span.op { color: #666666; }
code span.ot { color: #007020; }
code span.pp { color: #bc7a00; }
code span.sc { color: #4070a0; }
code span.ss { color: #bb6688; }
code span.st { color: #4070a0; }
code span.va { color: #19177c; }
code span.vs { color: #4070a0; }
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
<script>
// Move the colour styling pandoc attaches to div.sourceCode onto
// pre.sourceCode instead. Only the stylesheet whose owner element carries
// data-origin="pandoc" is inspected.
(function() {
  var allSheets = document.styleSheets;
  for (var s = 0; s < allSheets.length; s++) {
    var sheet = allSheets[s];
    if (sheet.ownerNode.dataset["origin"] !== "pandoc") continue;
    var rules;
    // Reading cssRules may throw; a sheet that does so is skipped.
    try { rules = sheet.cssRules; } catch (e) { continue; }
    var k = 0;
    while (k < rules.length) {
      var current = rules[k];
      // A candidate is a plain style rule whose selector is exactly div.sourceCode ...
      var isDivSourceCode = current.type === current.STYLE_RULE &&
        current.selectorText === "div.sourceCode";
      // ... that sets a color and/or a background-color.
      var setsColour = isDivSourceCode &&
        (current.style.color !== '' || current.style.backgroundColor !== '');
      if (!setsColour) {
        k++;
        continue;
      }
      var declarations = current.style.cssText;
      // Swap the rule in place; the replacement no longer matches the
      // div.sourceCode selector, so the next pass moves on to index k + 1.
      sheet.deleteRule(k);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', k);
    }
  }
})();
</script>
<style type="text/css">body {
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
/* Horizontal rule: reset the border on all four sides, then draw a single
   1px line along the top edge. (A preceding `border-style: solid` was dead:
   the `border` shorthand below reset it immediately.) */
hr {
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap;
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
/* Images: white mat with a thin rounded frame. The rule previously declared
   the `border` shorthand twice; only the later #CCCCCC value ever applied,
   so the overridden #DDDDDD declaration has been dropped. */
img {
background-color: #FFFFFF;
padding: 2px;
border-radius: 3px;
border: 1px solid #CCCCCC;
margin: 0 5px;
}
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
a[href^="http:"] {
text-decoration: underline; }
a[href^="https:"] {
text-decoration: underline; }
code > span.kw { color: #555; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn { color: #d14; }
code > span.fl { color: #d14; }
code > span.ch { color: #d14; }
code > span.st { color: #d14; }
code > span.co { color: #888888; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #900; font-weight: bold; }
code > span.er { color: #a61717; background-color: #e3d2d2; }
</style>
</head>
<body>
<h1 class="title toc-ignore">From base R</h1>
<h4 class="author">Sara Stoudt</h4>
<p>This vignette compares stringr functions to their base R equivalents
to help users transitioning from using base R to stringr.</p>
<div id="overall-differences" class="section level1">
<h1>Overall differences</h1>
<p>We’ll begin with a lookup table between the most important base
string functions and their stringr equivalents.</p>
<table>
<colgroup>
<col width="33%" />
<col width="66%" />
</colgroup>
<thead>
<tr class="header">
<th>base</th>
<th>stringr</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><code>gregexpr(pattern, x)</code></td>
<td><code>str_locate_all(x, pattern)</code></td>
</tr>
<tr class="even">
<td><code>grep(pattern, x, value = TRUE)</code></td>
<td><code>str_subset(x, pattern)</code></td>
</tr>
<tr class="odd">
<td><code>grep(pattern, x)</code></td>
<td><code>str_which(x, pattern)</code></td>
</tr>
<tr class="even">
<td><code>grepl(pattern, x)</code></td>
<td><code>str_detect(x, pattern)</code></td>
</tr>
<tr class="odd">
<td><code>gsub(pattern, replacement, x)</code></td>
<td><code>str_replace_all(x, pattern, replacement)</code></td>
</tr>
<tr class="even">
<td><code>nchar(x)</code></td>
<td><code>str_length(x)</code></td>
</tr>
<tr class="odd">
<td><code>order(x)</code></td>
<td><code>str_order(x)</code></td>
</tr>
<tr class="even">
<td><code>regexec(pattern, x)</code> + <code>regmatches()</code></td>
<td><code>str_match(x, pattern)</code></td>
</tr>
<tr class="odd">
<td><code>regexpr(pattern, x)</code> + <code>regmatches()</code></td>
<td><code>str_extract(x, pattern)</code></td>
</tr>
<tr class="even">
<td><code>regexpr(pattern, x)</code></td>
<td><code>str_locate(x, pattern)</code></td>
</tr>
<tr class="odd">
<td><code>sort(x)</code></td>
<td><code>str_sort(x)</code></td>
</tr>
<tr class="even">
<td><code>strrep(x, n)</code></td>
<td><code>str_dup(x, n)</code></td>
</tr>
<tr class="odd">
<td><code>strsplit(x, pattern)</code></td>
<td><code>str_split(x, pattern)</code></td>
</tr>
<tr class="even">
<td><code>strwrap(x)</code></td>
<td><code>str_wrap(x)</code></td>
</tr>
<tr class="odd">
<td><code>sub(pattern, replacement, x)</code></td>
<td><code>str_replace(x, pattern, replacement)</code></td>
</tr>
<tr class="even">
<td><code>substr(x, start, end)</code></td>
<td><code>str_sub(x, start, end)</code></td>
</tr>
<tr class="odd">
<td><code>tolower(x)</code></td>
<td><code>str_to_lower(x)</code></td>
</tr>
<tr class="even">
<td><code>tools::toTitleCase(x)</code></td>
<td><code>str_to_title(x)</code></td>
</tr>
<tr class="odd">
<td><code>toupper(x)</code></td>
<td><code>str_to_upper(x)</code></td>
</tr>
<tr class="even">
<td><code>trimws(x)</code></td>
<td><code>str_trim(x)</code></td>
</tr>
</tbody>
</table>
<p>Overall the main differences between base R and stringr are:</p>
<ol style="list-style-type: decimal">
<li><p>stringr functions start with <code>str_</code> prefix; base R
string functions have no consistent naming scheme.</p></li>
<li><p>The order of inputs is usually different between base R and
stringr. In base R, the <code>pattern</code> to match usually comes
first; in stringr, the <code>string</code> to manipulate always comes
first. This makes stringr easier to use in pipes, and with
<code>lapply()</code> or <code>purrr::map()</code>.</p></li>
<li><p>Functions in stringr tend to do less, where many of the string
processing functions in base R have multiple purposes.</p></li>
<li><p>The output and input of stringr functions have been carefully
designed. For example, the output of <code>str_locate()</code> can be
fed directly into <code>str_sub()</code>; the same is not true of
<code>regexpr()</code> and <code>substr()</code>.</p></li>
<li><p>Base functions use arguments (like <code>perl</code>,
<code>fixed</code>, and <code>ignore.case</code>) to control how the
pattern is interpreted. To avoid dependence between arguments, stringr
instead uses helper functions (like <code>fixed()</code>,
<code>regex()</code>, and <code>coll()</code>).</p></li>
</ol>
<p>Next we’ll walk through each of the functions, noting the
similarities and important differences. These examples are adapted from
the stringr documentation and here they are contrasted with the
analogous base R operations.</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(stringr)</span></code></pre></div>
</div>
<div id="detect-matches" class="section level1">
<h1>Detect matches</h1>
<div id="str_detect-detect-the-presence-or-absence-of-a-pattern-in-a-string" class="section level2">
<h2><code>str_detect()</code>: Detect the presence or absence of a
pattern in a string</h2>
<p>Suppose you want to know whether each word in a vector of fruit names
contains an “a”.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>fruit <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"apple"</span>, <span class="st">"banana"</span>, <span class="st">"pear"</span>, <span class="st">"pineapple"</span>)</span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="fu">grepl</span>(<span class="at">pattern =</span> <span class="st">"a"</span>, <span class="at">x =</span> fruit)</span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] TRUE TRUE TRUE TRUE</span></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a><span class="fu">str_detect</span>(fruit, <span class="at">pattern =</span> <span class="st">"a"</span>)</span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] TRUE TRUE TRUE TRUE</span></span></code></pre></div>
<p>In base you would use <code>grepl()</code> (see the “l” and think
logical) while in stringr you use <code>str_detect()</code> (see the
verb “detect” and think of a yes/no action).</p>
</div>
<div id="str_which-find-positions-matching-a-pattern" class="section level2">
<h2><code>str_which()</code>: Find positions matching a pattern</h2>
<p>Now you want to identify the positions of the words in a vector of
fruit names that contain an “a”.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="fu">grep</span>(<span class="at">pattern =</span> <span class="st">"a"</span>, <span class="at">x =</span> fruit)</span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1 2 3 4</span></span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="fu">str_which</span>(fruit, <span class="at">pattern =</span> <span class="st">"a"</span>)</span>
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1 2 3 4</span></span></code></pre></div>
<p>In base you would use <code>grep()</code> while in stringr you use
<code>str_which()</code> (by analogy to <code>which()</code>).</p>
</div>
<div id="str_count-count-the-number-of-matches-in-a-string" class="section level2">
<h2><code>str_count()</code>: Count the number of matches in a
string</h2>
<p>How many “a”s are in each fruit?</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co"># base </span></span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>loc <span class="ot"><-</span> <span class="fu">gregexpr</span>(<span class="at">pattern =</span> <span class="st">"a"</span>, <span class="at">text =</span> fruit, <span class="at">fixed =</span> <span class="cn">TRUE</span>)</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="fu">sapply</span>(loc, <span class="cf">function</span>(x) <span class="fu">length</span>(<span class="fu">attr</span>(x, <span class="st">"match.length"</span>)))</span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1 3 1 1</span></span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="fu">str_count</span>(fruit, <span class="at">pattern =</span> <span class="st">"a"</span>)</span>
<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1 3 1 1</span></span></code></pre></div>
<p>This information can be gleaned from <code>gregexpr()</code> in base,
but you need to look at the <code>match.length</code> attribute as the
vector uses a length-1 integer vector (<code>-1</code>) to indicate no
match.</p>
</div>
<div id="str_locate-locate-the-position-of-patterns-in-a-string" class="section level2">
<h2><code>str_locate()</code>: Locate the position of patterns in a
string</h2>
<p>Within each fruit, where does the first “p” occur? Where are all of
the “p”s?</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>fruit3 <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"papaya"</span>, <span class="st">"lime"</span>, <span class="st">"apple"</span>)</span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">gregexpr</span>(<span class="at">pattern =</span> <span class="st">"p"</span>, <span class="at">text =</span> fruit3))</span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> List of 3</span></span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> $ : int [1:2] 1 3</span></span>
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "match.length")= int [1:2] 1 1</span></span>
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "index.type")= chr "chars"</span></span>
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "useBytes")= logi TRUE</span></span>
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> $ : int -1</span></span>
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "match.length")= int -1</span></span>
<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "index.type")= chr "chars"</span></span>
<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "useBytes")= logi TRUE</span></span>
<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> $ : int [1:2] 2 3</span></span>
<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "match.length")= int [1:2] 1 1</span></span>
<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "index.type")= chr "chars"</span></span>
<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a><span class="co">#> ..- attr(*, "useBytes")= logi TRUE</span></span>
<span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-19"><a href="#cb5-19" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb5-20"><a href="#cb5-20" aria-hidden="true" tabindex="-1"></a><span class="fu">str_locate</span>(fruit3, <span class="at">pattern =</span> <span class="st">"p"</span>)</span>
<span id="cb5-21"><a href="#cb5-21" aria-hidden="true" tabindex="-1"></a><span class="co">#> start end</span></span>
<span id="cb5-22"><a href="#cb5-22" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1,] 1 1</span></span>
<span id="cb5-23"><a href="#cb5-23" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2,] NA NA</span></span>
<span id="cb5-24"><a href="#cb5-24" aria-hidden="true" tabindex="-1"></a><span class="co">#> [3,] 2 2</span></span>
<span id="cb5-25"><a href="#cb5-25" aria-hidden="true" tabindex="-1"></a><span class="fu">str_locate_all</span>(fruit3, <span class="at">pattern =</span> <span class="st">"p"</span>)</span>
<span id="cb5-26"><a href="#cb5-26" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
<span id="cb5-27"><a href="#cb5-27" aria-hidden="true" tabindex="-1"></a><span class="co">#> start end</span></span>
<span id="cb5-28"><a href="#cb5-28" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1,] 1 1</span></span>
<span id="cb5-29"><a href="#cb5-29" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2,] 3 3</span></span>
<span id="cb5-30"><a href="#cb5-30" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb5-31"><a href="#cb5-31" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
<span id="cb5-32"><a href="#cb5-32" aria-hidden="true" tabindex="-1"></a><span class="co">#> start end</span></span>
<span id="cb5-33"><a href="#cb5-33" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb5-34"><a href="#cb5-34" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[3]]</span></span>
<span id="cb5-35"><a href="#cb5-35" aria-hidden="true" tabindex="-1"></a><span class="co">#> start end</span></span>
<span id="cb5-36"><a href="#cb5-36" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1,] 2 2</span></span>
<span id="cb5-37"><a href="#cb5-37" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2,] 3 3</span></span></code></pre></div>
</div>
</div>
<div id="subset-strings" class="section level1">
<h1>Subset strings</h1>
<div id="str_sub-extract-and-replace-substrings-from-a-character-vector" class="section level2">
<h2><code>str_sub()</code>: Extract and replace substrings from a
character vector</h2>
<p>What if we want to grab part of a string?</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>hw <span class="ot"><-</span> <span class="st">"Hadley Wickham"</span></span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="fu">substr</span>(hw, <span class="at">start =</span> <span class="dv">1</span>, <span class="at">stop =</span> <span class="dv">6</span>)</span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley"</span></span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="fu">substring</span>(hw, <span class="at">first =</span> <span class="dv">1</span>) </span>
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley Wickham"</span></span>
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(hw, <span class="at">start =</span> <span class="dv">1</span>, <span class="at">end =</span> <span class="dv">6</span>)</span>
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley"</span></span>
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(hw, <span class="at">start =</span> <span class="dv">1</span>)</span>
<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley Wickham"</span></span>
<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(hw, <span class="at">end =</span> <span class="dv">6</span>)</span>
<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley"</span></span></code></pre></div>
<p>In base you could use <code>substr()</code> or
<code>substring()</code>. The former requires both a start and stop of
the substring while the latter assumes the stop will be the end of the
string. The stringr version, <code>str_sub()</code> has the same
functionality, but also gives a default start value (the beginning of
the string). Both the base and stringr functions have the same order of
expected inputs.</p>
<p>In stringr you can use negative numbers to index from the right-hand
side of the string: -1 is the last letter, -2 is the second to last, and so
on.</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(hw, <span class="at">start =</span> <span class="dv">1</span>, <span class="at">end =</span> <span class="sc">-</span><span class="dv">1</span>)</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley Wickham"</span></span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(hw, <span class="at">start =</span> <span class="sc">-</span><span class="dv">5</span>, <span class="at">end =</span> <span class="sc">-</span><span class="dv">2</span>)</span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "ckha"</span></span></code></pre></div>
<p>Both base R and stringr subset are vectorized over their parameters.
This means you can either choose the same subset across multiple strings
or specify different subsets for different strings.</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>al <span class="ot"><-</span> <span class="st">"Ada Lovelace"</span></span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="fu">substr</span>(<span class="fu">c</span>(hw,al), <span class="at">start =</span> <span class="dv">1</span>, <span class="at">stop =</span> <span class="dv">6</span>)</span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley" "Ada Lo"</span></span>
<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a><span class="fu">substr</span>(<span class="fu">c</span>(hw,al), <span class="at">start =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>), <span class="at">stop =</span> <span class="fu">c</span>(<span class="dv">6</span>,<span class="dv">7</span>))</span>
<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley" "Ada Lov"</span></span>
<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(<span class="fu">c</span>(hw,al), <span class="at">start =</span> <span class="dv">1</span>, <span class="at">end =</span> <span class="sc">-</span><span class="dv">1</span>)</span>
<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley Wickham" "Ada Lovelace"</span></span>
<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(<span class="fu">c</span>(hw,al), <span class="at">start =</span> <span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>), <span class="at">end =</span> <span class="fu">c</span>(<span class="sc">-</span><span class="dv">1</span>,<span class="sc">-</span><span class="dv">2</span>))</span>
<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley Wickham" "Ada Lovelac"</span></span></code></pre></div>
<p>stringr will automatically recycle the first argument to the same
length as <code>start</code> and <code>end</code>:</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(hw, <span class="at">start =</span> <span class="dv">1</span><span class="sc">:</span><span class="dv">5</span>)</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley Wickham" "adley Wickham" "dley Wickham" "ley Wickham" </span></span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [5] "ey Wickham"</span></span></code></pre></div>
<p>Whereas the base equivalent silently uses just the first value:</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">substr</span>(hw, <span class="at">start =</span> <span class="dv">1</span><span class="sc">:</span><span class="dv">5</span>, <span class="at">stop =</span> <span class="dv">15</span>)</span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "Hadley Wickham"</span></span></code></pre></div>
</div>
<div id="str_sub---subset-assignment" class="section level2">
<h2><code>str_sub() <-</code>: Subset assignment</h2>
<p><code>substr()</code> behaves in a surprising way when you replace a
substring with a different number of characters:</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>x <span class="ot"><-</span> <span class="st">"ABCDEF"</span></span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a><span class="fu">substr</span>(x, <span class="dv">1</span>, <span class="dv">3</span>) <span class="ot"><-</span> <span class="st">"x"</span></span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a>x</span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "xBCDEF"</span></span></code></pre></div>
<p><code>str_sub()</code> does what you would expect:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a>x <span class="ot"><-</span> <span class="st">"ABCDEF"</span></span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a><span class="fu">str_sub</span>(x, <span class="dv">1</span>, <span class="dv">3</span>) <span class="ot"><-</span> <span class="st">"x"</span></span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a>x</span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "xDEF"</span></span></code></pre></div>
</div>
<div id="str_subset-keep-strings-matching-a-pattern-or-find-positions" class="section level2">
<h2><code>str_subset()</code>: Keep strings matching a pattern, or find
positions</h2>
<p>We may want to retrieve strings that contain a pattern of
interest:</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="fu">grep</span>(<span class="at">pattern =</span> <span class="st">"g"</span>, <span class="at">x =</span> fruit, <span class="at">value =</span> <span class="cn">TRUE</span>)</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> character(0)</span></span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a><span class="fu">str_subset</span>(fruit, <span class="at">pattern =</span> <span class="st">"g"</span>)</span>
<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> character(0)</span></span></code></pre></div>
</div>
<div id="str_extract-extract-matching-patterns-from-a-string" class="section level2">
<h2><code>str_extract()</code>: Extract matching patterns from a
string</h2>
<p>We may want to pick out certain patterns from a string, for example,
the digits in a shopping list:</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a>shopping_list <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"apples x4"</span>, <span class="st">"bag of flour"</span>, <span class="st">"10"</span>, <span class="st">"milk x2"</span>)</span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a>matches <span class="ot"><-</span> <span class="fu">regexpr</span>(<span class="at">pattern =</span> <span class="st">"</span><span class="sc">\\</span><span class="st">d+"</span>, <span class="at">text =</span> shopping_list) <span class="co"># digits</span></span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a><span class="fu">regmatches</span>(shopping_list, <span class="at">m =</span> matches)</span>
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "4" "10" "2"</span></span>
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a>matches <span class="ot"><-</span> <span class="fu">gregexpr</span>(<span class="at">pattern =</span> <span class="st">"[a-z]+"</span>, <span class="at">text =</span> shopping_list) <span class="co"># words</span></span>
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a><span class="fu">regmatches</span>(shopping_list, <span class="at">m =</span> matches)</span>
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "apples" "x" </span></span>
<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "bag" "of" "flour"</span></span>
<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[3]]</span></span>
<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a><span class="co">#> character(0)</span></span>
<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[4]]</span></span>
<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "milk" "x"</span></span>
<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a><span class="fu">str_extract</span>(shopping_list, <span class="at">pattern =</span> <span class="st">"</span><span class="sc">\\</span><span class="st">d+"</span>) </span>
<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "4" NA "10" "2"</span></span>
<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a><span class="fu">str_extract_all</span>(shopping_list, <span class="st">"[a-z]+"</span>)</span>
<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[1]]</span></span>
<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "apples" "x" </span></span>
<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[2]]</span></span>
<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "bag" "of" "flour"</span></span>
<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[3]]</span></span>
<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a><span class="co">#> character(0)</span></span>
<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span>
<span id="cb14-35"><a href="#cb14-35" aria-hidden="true" tabindex="-1"></a><span class="co">#> [[4]]</span></span>
<span id="cb14-36"><a href="#cb14-36" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "milk" "x"</span></span></code></pre></div>
<p>Base R requires the combination of <code>regexpr()</code> (or
<code>gregexpr()</code> for all matches) with <code>regmatches()</code>,
but note that with <code>regexpr()</code> the strings without matches are
dropped from the output. stringr provides <code>str_extract()</code> and
<code>str_extract_all()</code>, and the output is always the same length
as the input.</p>
</div>
<div id="str_match-extract-matched-groups-from-a-string" class="section level2">
<h2><code>str_match()</code>: Extract matched groups from a string</h2>
<p>We may also want to extract groups from a string. Here I’m going to
use the scenario from Section 14.4.3 in <a href="https://r4ds.had.co.nz/strings.html">R for Data Science</a>.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(sentences)</span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "The birch canoe slid on the smooth planks." </span></span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2] "Glue the sheet to the dark blue background."</span></span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [3] "It's easy to tell the depth of a well." </span></span>
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> [4] "These days a chicken leg is a rare dish." </span></span>
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [5] "Rice is often served in round bowls." </span></span>
<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> [6] "The juice of lemons makes fine punch."</span></span>
<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a>noun <span class="ot"><-</span> <span class="st">"([A]a|[Tt]he) ([^ ]+)"</span></span>
<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a>matches <span class="ot"><-</span> <span class="fu">regexec</span>(<span class="at">pattern =</span> noun, <span class="at">text =</span> <span class="fu">head</span>(sentences))</span>
<span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a><span class="fu">do.call</span>(<span class="st">"rbind"</span>, <span class="fu">regmatches</span>(<span class="at">x =</span> <span class="fu">head</span>(sentences), <span class="at">m =</span> matches))</span>
<span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] </span></span>
<span id="cb15-14"><a href="#cb15-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1,] "The birch" "The" "birch"</span></span>
<span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2,] "the sheet" "the" "sheet"</span></span>
<span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> [3,] "the depth" "the" "depth"</span></span>
<span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a><span class="co">#> [4,] "The juice" "The" "juice"</span></span>
<span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb15-19"><a href="#cb15-19" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb15-20"><a href="#cb15-20" aria-hidden="true" tabindex="-1"></a><span class="fu">str_match</span>(<span class="fu">head</span>(sentences), <span class="at">pattern =</span> noun)</span>
<span id="cb15-21"><a href="#cb15-21" aria-hidden="true" tabindex="-1"></a><span class="co">#> [,1] [,2] [,3] </span></span>
<span id="cb15-22"><a href="#cb15-22" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1,] "The birch" "The" "birch"</span></span>
<span id="cb15-23"><a href="#cb15-23" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2,] "the sheet" "the" "sheet"</span></span>
<span id="cb15-24"><a href="#cb15-24" aria-hidden="true" tabindex="-1"></a><span class="co">#> [3,] "the depth" "the" "depth"</span></span>
<span id="cb15-25"><a href="#cb15-25" aria-hidden="true" tabindex="-1"></a><span class="co">#> [4,] NA NA NA </span></span>
<span id="cb15-26"><a href="#cb15-26" aria-hidden="true" tabindex="-1"></a><span class="co">#> [5,] NA NA NA </span></span>
<span id="cb15-27"><a href="#cb15-27" aria-hidden="true" tabindex="-1"></a><span class="co">#> [6,] "The juice" "The" "juice"</span></span></code></pre></div>
<p>As with extracting the full match, base R requires the combination of
two functions, and inputs with no matches are dropped from the
output.</p>
</div>
</div>
<div id="manage-lengths" class="section level1">
<h1>Manage lengths</h1>
<div id="str_length-the-length-of-a-string" class="section level2">
<h2><code>str_length()</code>: The length of a string</h2>
<p>To determine the length of a string, base R uses <code>nchar()</code>
(not to be confused with <code>length()</code>, which gives the length of
vectors, etc.) while stringr uses <code>str_length()</code>.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a><span class="fu">nchar</span>(letters)</span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1</span></span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a><span class="fu">str_length</span>(letters)</span>
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1</span></span></code></pre></div>
<p>There are some subtle differences between base and stringr here.
<code>nchar()</code> requires a character vector, so it will return an
error if used on a factor. <code>str_length()</code> can handle a factor
input.</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="co"># base</span></span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="fu">nchar</span>(<span class="fu">factor</span>(<span class="st">"abc"</span>)) </span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> Error in nchar(factor("abc")): 'nchar()' requires a character vector</span></span></code></pre></div>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="co"># stringr</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="fu">str_length</span>(<span class="fu">factor</span>(<span class="st">"abc"</span>))</span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 3</span></span></code></pre></div>
<p>Note that “characters” is a poorly defined concept, and technically
both <code>nchar()</code> and <code>str_length()</code> return the
number of code points. This is usually the same as what you’d consider
to be a character, but not always:</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>x <span class="ot"><-</span> <span class="fu">c</span>(<span class="st">"\u00fc"</span>, <span class="st">"u\u0308"</span>)</span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a>x</span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "ü" "ü"</span></span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a><span class="fu">nchar</span>(x)</span>
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1 2</span></span>
<span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1