skimr
Version:
CLI EDA for CSVs
612 lines (576 loc) • 37.9 kB
HTML
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Locales</title>
<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
document.addEventListener('DOMContentLoaded', function(e) {
  var headingTag = /^h[1-6]$/i;
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");
  Array.prototype.forEach.call(firstChildren, function(node) {
    // only h1-h6 elements are touched; any other first child is left alone
    if (!headingTag.test(node.tagName)) return;
    var attrs = node.attributes;
    // keep dropping the first attribute until the element has none left
    while (attrs.length > 0) node.removeAttribute(attrs[0].name);
  });
});
</script>
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
<style type="text/css">
code {
white-space: pre;
}
.sourceCode {
overflow: visible;
}
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
/* Empty in this document. The inline script after this stylesheet looks up
   the rule whose selector is exactly "div.sourceCode" and, if it sets color
   or background-color, re-inserts it as a pre.sourceCode rule. */
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; }
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.at { color: #7d9029; }
code span.bn { color: #40a070; }
code span.bu { color: #008000; }
code span.cf { color: #007020; font-weight: bold; }
code span.ch { color: #4070a0; }
code span.cn { color: #880000; }
code span.co { color: #60a0b0; font-style: italic; }
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.do { color: #ba2121; font-style: italic; }
code span.dt { color: #902000; }
code span.dv { color: #40a070; }
code span.er { color: #ff0000; font-weight: bold; }
code span.ex { }
code span.fl { color: #40a070; }
code span.fu { color: #06287e; }
code span.im { color: #008000; font-weight: bold; }
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.kw { color: #007020; font-weight: bold; }
code span.op { color: #666666; }
code span.ot { color: #007020; }
code span.pp { color: #bc7a00; }
code span.sc { color: #4070a0; }
code span.ss { color: #bb6688; }
code span.st { color: #4070a0; }
code span.va { color: #19177c; }
code span.vs { color: #4070a0; }
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function() {
  var allSheets = document.styleSheets;
  for (var s = 0; s < allSheets.length; s++) {
    var sheet = allSheets[s];
    // only the stylesheet tagged data-origin="pandoc" is rewritten
    if (sheet.ownerNode.dataset["origin"] !== "pandoc") continue;
    // skip sheets whose rule list cannot be read
    var ruleList;
    try { ruleList = sheet.cssRules; } catch (e) { continue; }
    for (var k = 0; k < ruleList.length; k++) {
      var current = ruleList[k];
      // look for a style rule whose selector is exactly div.sourceCode
      var isDivRule = current.type === current.STYLE_RULE &&
        current.selectorText === "div.sourceCode";
      if (!isDivRule) continue;
      // nothing to move unless color or background-color is set
      if (current.style.color === '' && current.style.backgroundColor === '') continue;
      // swap the div.sourceCode rule for a pre.sourceCode rule at the same index
      var declarations = current.style.cssText;
      sheet.deleteRule(k);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', k);
    }
  }
})();
</script>
<style type="text/css">body {
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
/* Horizontal rule: all borders removed, then a single 1px line drawn on top.
   (A preceding border-style declaration was dropped: the border shorthand
   reset it, so it never took effect.) */
hr {
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap;
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
/* Images: white padded frame with a rounded 1px border.
   (An earlier duplicate border declaration with #DDDDDD was dropped: the
   later #CCCCCC declaration always overrode it.) */
img {
background-color: #FFFFFF;
padding: 2px;
border-radius: 3px;
border: 1px solid #CCCCCC;
margin: 0 5px;
}
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
/* underline links whose href starts with "http:" or "https:" */
a[href^="http:"],
a[href^="https:"] {
text-decoration: underline;
}
code > span.kw { color: #555; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn { color: #d14; }
code > span.fl { color: #d14; }
code > span.ch { color: #d14; }
code > span.st { color: #d14; }
code > span.co { color: #888888; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #900; font-weight: bold; }
code > span.er { color: #a61717; background-color: #e3d2d2; }
</style>
</head>
<body>
<h1 class="title toc-ignore">Locales</h1>
<p>The goal of readr’s locales is to encapsulate common options that
vary between languages and localities. This includes:</p>
<ul>
<li>The names of months and days, used when parsing dates.</li>
<li>The default time zone, used when parsing datetimes.</li>
<li>The character encoding, used when reading non-ASCII strings.</li>
<li>Default date format, used when guessing column types.</li>
<li>The decimal and grouping marks, used when reading numbers.</li>
</ul>
<p>(Strictly speaking these are not locales in the usual technical sense
of the word because they also contain information about time zones and
encoding.)</p>
<p>To create a new locale, you use the <code>locale()</code>
function:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">locale</span>()</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> <locale></span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> Numbers: 123,456.78</span></span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> Formats: %AD / %AT</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> Timezone: UTC</span></span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> Encoding: UTF-8</span></span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> <date_names></span></span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday</span></span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> (Thu), Friday (Fri), Saturday (Sat)</span></span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),</span></span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> June (Jun), July (Jul), August (Aug), September (Sep), October</span></span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> (Oct), November (Nov), December (Dec)</span></span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> AM/PM: AM/PM</span></span></code></pre></div>
<p>The rest of this vignette explains what each of the options
does.</p>
<p>All of the parsing functions in readr take a <code>locale</code>
argument. You’ll most often use it with <code>read_csv()</code>,
<code>read_fwf()</code> or <code>read_table()</code>. Readr is designed
to work the same way across systems, so the default locale is English
centric like R. If you’re not in an English speaking country, this makes
initial import a little harder, because you have to override the
defaults. But the payoff is big: you can share your code and know that
it will work on any other system. Base R takes a different philosophy.
It uses system defaults, so typical data import is a little easier, but
sharing code is harder.</p>
<p>Rather than demonstrating the use of locales with
<code>read_csv()</code> and fields, in this vignette I’m going to use
the <code>parse_*()</code> functions. These work with a character vector
instead of a file on disk, so they’re easier to use in examples. They’re
also useful in their own right if you need to do custom parsing. See
<code>type_convert()</code> if you need to apply multiple parsers to a
data frame.</p>
<div id="dates-and-times" class="section level2">
<h2>Dates and times</h2>
<div id="names-of-months-and-days" class="section level3">
<h3>Names of months and days</h3>
<p>The first argument to <code>locale()</code> is
<code>date_names</code>, and it controls what values are used for month
and day names. The easiest way to specify it is with an ISO 639 language
code:</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">locale</span>(<span class="st">"ko"</span>) <span class="co"># Korean</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> <locale></span></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> Numbers: 123,456.78</span></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> Formats: %AD / %AT</span></span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> Timezone: UTC</span></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> Encoding: UTF-8</span></span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> <date_names></span></span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> Days: 일요일 (일), 월요일 (월), 화요일 (화), 수요일 (수), 목요일 (목), 금요일</span></span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> (금), 토요일 (토)</span></span>
<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> Months: 1월, 2월, 3월, 4월, 5월, 6월, 7월, 8월, 9월, 10월, 11월, 12월</span></span>
<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> AM/PM: 오전/오후</span></span>
<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a><span class="fu">locale</span>(<span class="st">"fr"</span>) <span class="co"># French</span></span>
<span id="cb2-13"><a href="#cb2-13" aria-hidden="true" tabindex="-1"></a><span class="co">#> <locale></span></span>
<span id="cb2-14"><a href="#cb2-14" aria-hidden="true" tabindex="-1"></a><span class="co">#> Numbers: 123,456.78</span></span>
<span id="cb2-15"><a href="#cb2-15" aria-hidden="true" tabindex="-1"></a><span class="co">#> Formats: %AD / %AT</span></span>
<span id="cb2-16"><a href="#cb2-16" aria-hidden="true" tabindex="-1"></a><span class="co">#> Timezone: UTC</span></span>
<span id="cb2-17"><a href="#cb2-17" aria-hidden="true" tabindex="-1"></a><span class="co">#> Encoding: UTF-8</span></span>
<span id="cb2-18"><a href="#cb2-18" aria-hidden="true" tabindex="-1"></a><span class="co">#> <date_names></span></span>
<span id="cb2-19"><a href="#cb2-19" aria-hidden="true" tabindex="-1"></a><span class="co">#> Days: dimanche (dim.), lundi (lun.), mardi (mar.), mercredi (mer.), jeudi</span></span>
<span id="cb2-20"><a href="#cb2-20" aria-hidden="true" tabindex="-1"></a><span class="co">#> (jeu.), vendredi (ven.), samedi (sam.)</span></span>
<span id="cb2-21"><a href="#cb2-21" aria-hidden="true" tabindex="-1"></a><span class="co">#> Months: janvier (janv.), février (févr.), mars (mars), avril (avr.), mai (mai),</span></span>
<span id="cb2-22"><a href="#cb2-22" aria-hidden="true" tabindex="-1"></a><span class="co">#> juin (juin), juillet (juil.), août (août), septembre (sept.),</span></span>
<span id="cb2-23"><a href="#cb2-23" aria-hidden="true" tabindex="-1"></a><span class="co">#> octobre (oct.), novembre (nov.), décembre (déc.)</span></span>
<span id="cb2-24"><a href="#cb2-24" aria-hidden="true" tabindex="-1"></a><span class="co">#> AM/PM: AM/PM</span></span></code></pre></div>
<p>If you don’t already know the code for your language, <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">Wikipedia</a>
has a good list. Currently readr has 185 languages available. You can
list them all with <code>date_names_langs()</code>.</p>
<p>Specifying a locale allows you to parse dates in other languages:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_date</span>(<span class="st">"1 janvier 2015"</span>, <span class="st">"%d %B %Y"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="st">"fr"</span>))</span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "2015-01-01"</span></span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_date</span>(<span class="st">"14 oct. 1979"</span>, <span class="st">"%d %b %Y"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="st">"fr"</span>))</span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "1979-10-14"</span></span></code></pre></div>
<p>For many languages, it’s common to find that diacritics have been
stripped so they can be stored as ASCII. You can tell the locale that
with the <code>asciify</code> option:</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_date</span>(<span class="st">"1 août 2015"</span>, <span class="st">"%d %B %Y"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="st">"fr"</span>))</span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "2015-08-01"</span></span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_date</span>(<span class="st">"1 aout 2015"</span>, <span class="st">"%d %B %Y"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="st">"fr"</span>, <span class="at">asciify =</span> <span class="cn">TRUE</span>))</span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "2015-08-01"</span></span></code></pre></div>
<p>Note that the quality of the translations is variable, especially for
the rarer languages. If you discover that they’re not quite right for
your data, you can create your own with <code>date_names()</code>. The
following example creates a locale with Māori date names:</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>maori <span class="ot"><-</span> <span class="fu">locale</span>(<span class="fu">date_names</span>(</span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a> <span class="at">day =</span> <span class="fu">c</span>(<span class="st">"Rātapu"</span>, <span class="st">"Rāhina"</span>, <span class="st">"Rātū"</span>, <span class="st">"Rāapa"</span>, <span class="st">"Rāpare"</span>, <span class="st">"Rāmere"</span>, <span class="st">"Rāhoroi"</span>),</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> <span class="at">mon =</span> <span class="fu">c</span>(<span class="st">"Kohi-tātea"</span>, <span class="st">"Hui-tanguru"</span>, <span class="st">"Poutū-te-rangi"</span>, <span class="st">"Paenga-whāwhā"</span>,</span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a> <span class="st">"Haratua"</span>, <span class="st">"Pipiri"</span>, <span class="st">"Hōngongoi"</span>, <span class="st">"Here-turi-kōkā"</span>, <span class="st">"Mahuru"</span>,</span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a> <span class="st">"Whiringa-ā-nuku"</span>, <span class="st">"Whiringa-ā-rangi"</span>, <span class="st">"Hakihea"</span>)</span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a>))</span></code></pre></div>
</div>
<div id="timezones" class="section level3">
<h3>Timezones</h3>
<p>Unless otherwise specified, readr assumes that times are in UTC, the
Coordinated Universal Time (this is a successor to GMT and for almost
all intents is identical). UTC is most suitable for data because it
doesn’t have daylight savings - this avoids a whole class of potential
problems. If your data isn’t already in UTC, you’ll need to supply a
<code>tz</code> in the locale:</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_datetime</span>(<span class="st">"2001-10-10 20:10"</span>)</span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "2001-10-10 20:10:00 UTC"</span></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_datetime</span>(<span class="st">"2001-10-10 20:10"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">tz =</span> <span class="st">"Pacific/Auckland"</span>))</span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "2001-10-10 20:10:00 NZDT"</span></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_datetime</span>(<span class="st">"2001-10-10 20:10"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">tz =</span> <span class="st">"Europe/Dublin"</span>))</span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "2001-10-10 20:10:00 IST"</span></span></code></pre></div>
<p>You can see a complete list of time zones with
<code>OlsonNames()</code>.</p>
<p>If you’re American, note that “EST” is a Canadian time zone that does
not have DST. It’s not Eastern Standard Time! Instead use:</p>
<ul>
<li>PST/PDT = “US/Pacific”</li>
<li>CST/CDT = “US/Central”</li>
<li>MST/MDT = “US/Mountain”</li>
<li>EST/EDT = “US/Eastern”</li>
</ul>
<p>(Note that there are more specific time zones for smaller areas that
don’t follow the same rules. For example, “US/Arizona” mostly
follows mountain time, but doesn’t have daylight savings. If
you’re dealing with historical data, you might need an even more
specific zone like “America/North_Dakota/New_Salem” - that will get you
the most accurate time zones.)</p>
<p>Note that these are only used as defaults. If individual times have
timezones and you’re using “%Z” (as name, e.g. “America/Chicago”) or
“%z” (as offset from UTC, e.g. “+0800”), they’ll override the defaults.
There’s currently no good way to parse times that use US
abbreviations.</p>
<p>Note that once you have the date in R, changing the time zone just
changes its printed representation - it still represents the same
instants of time. If you’ve loaded non-UTC data, and want to display it
as UTC, try this snippet of code:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>is_datetime <span class="ot"><-</span> <span class="fu">sapply</span>(df, inherits, <span class="st">"POSIXct"</span>)</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a>df[is_datetime] <span class="ot"><-</span> <span class="fu">lapply</span>(df[is_datetime], <span class="cf">function</span>(x) {</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">attr</span>(x, <span class="st">"tzone"</span>) <span class="ot"><-</span> <span class="st">"UTC"</span></span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> x</span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a>})</span></code></pre></div>
</div>
<div id="default-formats" class="section level3">
<h3>Default formats</h3>
<p>Locales also provide default date and time formats. The date format
is used when guessing column types. The default date format is
<code>%AD</code>, a flexible YMD parser (see
<code>?parse_date</code>):</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"2010-10-10"</span>))</span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> Date[1:1], format: "2010-10-10"</span></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"2010/10/10"</span>))</span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> Date[1:1], format: "2010-10-10"</span></span></code></pre></div>
<p>If you’re an American, you might want to use your illogical date
system:</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"01/31/2013"</span>))</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> chr "01/31/2013"</span></span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"01/31/2013"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">date_format =</span> <span class="st">"%m/%d/%Y"</span>)))</span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> Date[1:1], format: "2013-01-31"</span></span></code></pre></div>
<p>The time format is also used when guessing column types. The default
time format is <code>%AT</code>, a flexible HMS parser (see
<code>?parse_time</code>):</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"17:55:14"</span>))</span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> 'hms' num 17:55:14</span></span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> - attr(*, "units")= chr "secs"</span></span>
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"5:55:14 PM"</span>))</span>
<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> 'hms' num 17:55:14</span></span>
<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> - attr(*, "units")= chr "secs"</span></span>
<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Example of a non-standard time</span></span>
<span id="cb10-8"><a href="#cb10-8" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"h5m55s14 PM"</span>))</span>
<span id="cb10-9"><a href="#cb10-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> chr "h5m55s14 PM"</span></span>
<span id="cb10-10"><a href="#cb10-10" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(<span class="fu">parse_guess</span>(<span class="st">"h5m55s14 PM"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">time_format =</span> <span class="st">"h%Hm%Ms%S %p"</span>)))</span>
<span id="cb10-11"><a href="#cb10-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> 'hms' num 17:55:14</span></span>
<span id="cb10-12"><a href="#cb10-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> - attr(*, "units")= chr "secs"</span></span></code></pre></div>
</div>
</div>
<div id="character" class="section level2">
<h2>Character</h2>
<p>All readr functions yield strings encoded in UTF-8. This encoding is
the most likely to give good results in the widest variety of settings.
By default, readr assumes that your input is also in UTF-8. This is less
likely to be the case, especially when you’re working with older
datasets.</p>
<p>The following code illustrates the problems with encodings:</p>
<!-- not currently evaluating the next two chunks due to https://github.com/tidyverse/readr/issues/1337 -->
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(stringi)</span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>x <span class="ot"><-</span> <span class="st">"Émigré cause célèbre déjà vu.</span><span class="sc">\n</span><span class="st">"</span></span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>y <span class="ot"><-</span> <span class="fu">stri_conv</span>(x, <span class="st">"UTF-8"</span>, <span class="st">"latin1"</span>)</span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="co"># These strings look like they're identical:</span></span>
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a>x</span>
<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a>y</span>
<span id="cb11-8"><a href="#cb11-8" aria-hidden="true" tabindex="-1"></a><span class="fu">identical</span>(x, y)</span>
<span id="cb11-9"><a href="#cb11-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-10"><a href="#cb11-10" aria-hidden="true" tabindex="-1"></a><span class="co"># But they have different encodings:</span></span>
<span id="cb11-11"><a href="#cb11-11" aria-hidden="true" tabindex="-1"></a><span class="fu">Encoding</span>(x)</span>
<span id="cb11-12"><a href="#cb11-12" aria-hidden="true" tabindex="-1"></a><span class="fu">Encoding</span>(y)</span>
<span id="cb11-13"><a href="#cb11-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-14"><a href="#cb11-14" aria-hidden="true" tabindex="-1"></a><span class="co"># That means while they print the same, their raw (binary)</span></span>
<span id="cb11-15"><a href="#cb11-15" aria-hidden="true" tabindex="-1"></a><span class="co"># representation is actually quite different:</span></span>
<span id="cb11-16"><a href="#cb11-16" aria-hidden="true" tabindex="-1"></a><span class="fu">charToRaw</span>(x)</span>
<span id="cb11-17"><a href="#cb11-17" aria-hidden="true" tabindex="-1"></a><span class="fu">charToRaw</span>(y)</span>
<span id="cb11-18"><a href="#cb11-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-19"><a href="#cb11-19" aria-hidden="true" tabindex="-1"></a><span class="co"># readr expects strings to be encoded as UTF-8. If they're</span></span>
<span id="cb11-20"><a href="#cb11-20" aria-hidden="true" tabindex="-1"></a><span class="co"># not, you'll get weird characters</span></span>
<span id="cb11-21"><a href="#cb11-21" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_character</span>(x)</span>
<span id="cb11-22"><a href="#cb11-22" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_character</span>(y)</span>
<span id="cb11-23"><a href="#cb11-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-24"><a href="#cb11-24" aria-hidden="true" tabindex="-1"></a><span class="co"># If you know the encoding, supply it:</span></span>
<span id="cb11-25"><a href="#cb11-25" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_character</span>(y, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">encoding =</span> <span class="st">"latin1"</span>))</span></code></pre></div>
<p>If you don’t know what encoding the file uses, try <a href="https://readr.tidyverse.org/reference/encoding.html"><code>guess_encoding()</code></a>.
It’s not 100% perfect (as it’s fundamentally a heuristic), but should at
least get you pointed in the right direction:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">guess_encoding</span>(x)</span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="fu">guess_encoding</span>(y)</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Note that the first guess produces a valid string, but isn't correct:</span></span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_character</span>(y, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">encoding =</span> <span class="st">"ISO-8859-2"</span>))</span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a><span class="co"># But ISO-8859-1 is another name for latin1</span></span>
<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_character</span>(y, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">encoding =</span> <span class="st">"ISO-8859-1"</span>))</span></code></pre></div>
</div>
<div id="numbers" class="section level2">
<h2>Numbers</h2>
<p>Some countries use the decimal point, while others use the decimal
comma. The <code>decimal_mark</code> option controls which readr uses
when parsing doubles:</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_double</span>(<span class="st">"1,23"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">decimal_mark =</span> <span class="st">","</span>))</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1.23</span></span></code></pre></div>
<p>Additionally, when writing out big numbers, you might have
<code>1,000,000</code>, <code>1.000.000</code>, <code>1 000 000</code>,
or <code>1'000'000</code>. The grouping mark is ignored by the more
flexible number parser:</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_number</span>(<span class="st">"$1,234.56"</span>)</span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1234.56</span></span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_number</span>(<span class="st">"$1.234,56"</span>, </span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a> <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">decimal_mark =</span> <span class="st">","</span>, <span class="at">grouping_mark =</span> <span class="st">"."</span>)</span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a>)</span>
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1234.56</span></span>
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a><span class="co"># readr is smart enough to guess that if you're using , for decimals then</span></span>
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a><span class="co"># you're probably using . for grouping:</span></span>
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="fu">parse_number</span>(<span class="st">"$1.234,56"</span>, <span class="at">locale =</span> <span class="fu">locale</span>(<span class="at">decimal_mark =</span> <span class="st">","</span>))</span>
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 1234.56</span></span></code></pre></div>
</div>
<!-- code folding -->
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
  // build a <script> element pointing at MathJax and attach it to <head>
  var headEl = document.getElementsByTagName("head")[0];
  var loader = document.createElement("script");
  loader.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
  loader.type = "text/javascript";
  headEl.appendChild(loader);
})();
</script>
</body>
</html>