skimr

<!DOCTYPE html> <html> <head> <meta charset="utf-8" /> <meta name="generator" content="pandoc" /> <meta http-equiv="X-UA-Compatible" content="IE=EDGE" /> <meta name="viewport" content="width=device-width, initial-scale=1" /> <meta name="date" content="2022-12-23" /> <title>Extending skimr</title> <script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to // be compatible with the behavior of Pandoc < 2.8). document.addEventListener('DOMContentLoaded', function(e) { var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); var i, h, a; for (i = 0; i < hs.length; i++) { h = hs[i]; if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 a = h.attributes; while (a.length > 0) h.removeAttribute(a[0].name); } }); </script> <style type="text/css"> code{white-space: pre-wrap;} span.smallcaps{font-variant: small-caps;} span.underline{text-decoration: underline;} div.column{display: inline-block; vertical-align: top; width: 50%;} div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} ul.task-list{list-style: none;} </style> <style type="text/css"> code { white-space: pre; } .sourceCode { overflow: visible; } </style> <style type="text/css" data-origin="pandoc"> pre > code.sourceCode { white-space: pre; position: relative; } pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } pre > code.sourceCode > span:empty { height: 1.2em; } .sourceCode { overflow: visible; } code.sourceCode > span { color: inherit; text-decoration: inherit; } div.sourceCode { margin: 1em 0; } pre.sourceCode { margin: 0; } @media screen { div.sourceCode { overflow: auto; } } @media print { pre > code.sourceCode { white-space: pre-wrap; } pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } } pre.numberSource code { counter-reset: source-line 0; } pre.numberSource code > span { position: relative; left: -4em; counter-increment: source-line; } pre.numberSource code > span > 
a:first-child::before { content: counter(source-line); position: relative; left: -1em; text-align: right; vertical-align: baseline; border: none; display: inline-block; -webkit-touch-callout: none; -webkit-user-select: none; -khtml-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; padding: 0 4px; width: 4em; color: #aaaaaa; } pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } div.sourceCode { } @media screen { pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } } code span.al { color: #ff0000; font-weight: bold; } code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } code span.at { color: #7d9029; } code span.bn { color: #40a070; } code span.bu { color: #008000; } code span.cf { color: #007020; font-weight: bold; } code span.ch { color: #4070a0; } code span.cn { color: #880000; } code span.co { color: #60a0b0; font-style: italic; } code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } code span.do { color: #ba2121; font-style: italic; } code span.dt { color: #902000; } code span.dv { color: #40a070; } code span.er { color: #ff0000; font-weight: bold; } code span.ex { } code span.fl { color: #40a070; } code span.fu { color: #06287e; } code span.im { color: #008000; font-weight: bold; } code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } code span.kw { color: #007020; font-weight: bold; } code span.op { color: #666666; } code span.ot { color: #007020; } code span.pp { color: #bc7a00; } code span.sc { color: #4070a0; } code span.ss { color: #bb6688; } code span.st { color: #4070a0; } code span.va { color: #19177c; } code span.vs { color: #4070a0; } code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } </style> <script> // apply pandoc div.sourceCode style to pre.sourceCode instead (function() { var sheets = document.styleSheets; for (var i = 0; i < sheets.length; i++) { if 
(sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue; try { var rules = sheets[i].cssRules; } catch (e) { continue; } for (var j = 0; j < rules.length; j++) { var rule = rules[j]; // check if there is a div.sourceCode rule if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") continue; var style = rule.style.cssText; // check if color or background-color is set if (rule.style.color === '' && rule.style.backgroundColor === '') continue; // replace div.sourceCode by a pre.sourceCode rule sheets[i].deleteRule(j); sheets[i].insertRule('pre.sourceCode{' + style + '}', j); } } })(); </script> <style type="text/css">body { background-color: #fff; margin: 1em auto; max-width: 700px; overflow: visible; padding-left: 2em; padding-right: 2em; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; font-size: 14px; line-height: 1.35; } #TOC { clear: both; margin: 0 0 10px 10px; padding: 4px; width: 400px; border: 1px solid #CCCCCC; border-radius: 5px; background-color: #f6f6f6; font-size: 13px; line-height: 1.3; } #TOC .toctitle { font-weight: bold; font-size: 15px; margin-left: 5px; } #TOC ul { padding-left: 40px; margin-left: -1.5em; margin-top: 5px; margin-bottom: 5px; } #TOC ul ul { margin-left: -2em; } #TOC li { line-height: 16px; } table { margin: 1em auto; border-width: 1px; border-color: #DDDDDD; border-style: outset; border-collapse: collapse; } table th { border-width: 2px; padding: 5px; border-style: inset; } table td { border-width: 1px; border-style: inset; line-height: 18px; padding: 5px 5px; } table, table th, table td { border-left-style: none; border-right-style: none; } table thead, table tr.even { background-color: #f7f7f7; } p { margin: 0.5em 0; } blockquote { background-color: #f6f6f6; padding: 0.25em 0.75em; } hr { border-style: solid; border: none; border-top: 1px solid #777; margin: 28px 0; } dl { margin-left: 0; } dl dd { margin-bottom: 13px; margin-left: 13px; } dl dt { font-weight: bold; } ul { 
margin-top: 0; } ul li { list-style: circle outside; } ul ul { margin-bottom: 0; } pre, code { background-color: #f7f7f7; border-radius: 3px; color: #333; white-space: pre-wrap; } pre { border-radius: 3px; margin: 5px 0px 10px 0px; padding: 10px; } pre:not([class]) { background-color: #f7f7f7; } code { font-family: Consolas, Monaco, 'Courier New', monospace; font-size: 85%; } p > code, li > code { padding: 2px 0px; } div.figure { text-align: center; } img { background-color: #FFFFFF; padding: 2px; border: 1px solid #DDDDDD; border-radius: 3px; border: 1px solid #CCCCCC; margin: 0 5px; } h1 { margin-top: 0; font-size: 35px; line-height: 40px; } h2 { border-bottom: 4px solid #f7f7f7; padding-top: 10px; padding-bottom: 2px; font-size: 145%; } h3 { border-bottom: 2px solid #f7f7f7; padding-top: 10px; font-size: 120%; } h4 { border-bottom: 1px solid #f7f7f7; margin-left: 8px; font-size: 105%; } h5, h6 { border-bottom: 1px solid #ccc; font-size: 105%; } a { color: #0033dd; text-decoration: none; } a:hover { color: #6666ff; } a:visited { color: #800080; } a:visited:hover { color: #BB00BB; } a[href^="http:"] { text-decoration: underline; } a[href^="https:"] { text-decoration: underline; } code > span.kw { color: #555; font-weight: bold; } code > span.dt { color: #902000; } code > span.dv { color: #40a070; } code > span.bn { color: #d14; } code > span.fl { color: #d14; } code > span.ch { color: #d14; } code > span.st { color: #d14; } code > span.co { color: #888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } </style> </head> <body> <h1 class="title toc-ignore">Extending skimr</h1> <h4 class="date">2022-12-23</h4> <div id="introduction" class="section level2"> <h2>Introduction</h2> <p>The <code>skim()</code> function summarizes data types contained within data frames and objects that have 
<code>as.data.frame()</code> methods to coerce them into data frames. It comes with a set of default summary functions for a wide variety of data types, but this is not comprehensive.</p> <p>Package authors (and advanced users) can add support for skimming their specific non-data-frame objects in their packages, and they can provide different defaults in their own summary functions. This will require including skimr as a dependency.</p> </div> <div id="skimming-objects-that-are-not-coercible-to-data-frames" class="section level1"> <h1>Skimming objects that are not coercible to data frames</h1> <p>This example will illustrate this by creating support for the <code>lm</code> object produced by <code>lm()</code>. For any object this involves two required elements and one optional element. This is a simple example, but for other types of objects there may be much more complexity.</p> <p>If you are adding skim support to a package you will also need to add <code>skimr</code> to the list of imports.</p> <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(skimr)</span></code></pre></div> <p>The <code>lm()</code> function produces a complex object with class “lm”.</p> <div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>results <span class="ot"><-</span> <span class="fu">lm</span>(weight <span class="sc">~</span> feed, <span class="at">data =</span> chickwts)</span> <span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(results)</span></code></pre></div> <pre><code>## [1] "lm"</code></pre> <div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span 
class="fu">attributes</span>(results)</span></code></pre></div> <pre><code>## $names ## [1] "coefficients" "residuals" "effects" "rank" ## [5] "fitted.values" "assign" "qr" "df.residual" ## [9] "contrasts" "xlevels" "call" "terms" ## [13] "model" ## ## $class ## [1] "lm"</code></pre> <p>There is no as.data.frame method for an <code>lm</code> object.</p> <div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">as.data.frame</span>(results)</span> <span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> Error in as.data.frame.default(results) :</span></span> <span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> cannot coerce class ‘"lm"’ to a data.frame</span></span></code></pre></div> <p>Unlike the example in the “Using skimr” vignette of having a new type of data in a column of a simple data frame (for which we would create an <code>sfl</code>), this is a different type of challenge: an object that we might wish to skim, but that cannot be directly skimmed. Therefore we need to make it into an object that is either a data frame or coercible to a data frame.</p> <p>In the case of the lm object, the <code>model</code> component is already a data frame. 
So a very simple way to solve the challenge is to skim <code>results$model</code> directly.</p> <div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">skim</span>(results<span class="sc">$</span>model)</span></code></pre></div> <table> <caption>Data summary</caption> <tbody> <tr class="odd"> <td align="left">Name</td> <td align="left">results$model</td> </tr> <tr class="even"> <td align="left">Number of rows</td> <td align="left">71</td> </tr> <tr class="odd"> <td align="left">Number of columns</td> <td align="left">2</td> </tr> <tr class="even"> <td align="left">_______________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Column type frequency:</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">factor</td> <td align="left">1</td> </tr> <tr class="odd"> <td align="left">numeric</td> <td align="left">1</td> </tr> <tr class="even"> <td align="left">________________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Group variables</td> <td align="left">None</td> </tr> </tbody> </table> <p><strong>Variable type: factor</strong></p> <table> <colgroup> <col width="15%" /> <col width="11%" /> <col width="15%" /> <col width="8%" /> <col width="10%" /> <col width="38%" /> </colgroup> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="left">ordered</th> <th align="right">n_unique</th> <th align="left">top_counts</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">feed</td> <td align="right">0</td> <td align="right">1</td> <td align="left">FALSE</td> <td align="right">6</td> <td align="left">soy: 14, cas: 12, lin: 12, sun: 12</td> </tr> </tbody> </table> <p><strong>Variable type: numeric</strong></p> <table style="width:100%;"> <colgroup> <col width="17%" /> <col width="12%" /> <col 
width="17%" /> <col width="8%" /> <col width="7%" /> <col width="4%" /> <col width="7%" /> <col width="4%" /> <col width="7%" /> <col width="6%" /> <col width="7%" /> </colgroup> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="right">mean</th> <th align="right">sd</th> <th align="right">p0</th> <th align="right">p25</th> <th align="right">p50</th> <th align="right">p75</th> <th align="right">p100</th> <th align="left">hist</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">weight</td> <td align="right">0</td> <td align="right">1</td> <td align="right">261.31</td> <td align="right">78.07</td> <td align="right">108</td> <td align="right">204.5</td> <td align="right">258</td> <td align="right">323.5</td> <td align="right">423</td> <td align="left">▆▆▇▇▃</td> </tr> </tbody> </table> <p>This works, but we could go one step further and create a new function for doing this directly.</p> <div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>skim_lm <span class="ot"><-</span> <span class="cf">function</span>(.data) {</span> <span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a> .data <span class="ot"><-</span> .data<span class="sc">$</span>model</span> <span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a> skimr<span class="sc">::</span><span class="fu">skim</span>(.data)</span> <span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a>}</span> <span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a></span> <span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a><span class="fu">lm</span>(weight <span class="sc">~</span> feed, <span class="at">data =</span> chickwts) <span class="sc">%>%</span> <span class="fu">skim_lm</span>()</span></code></pre></div> <table> <caption>Data 
summary</caption> <tbody> <tr class="odd"> <td align="left">Name</td> <td align="left">Piped data</td> </tr> <tr class="even"> <td align="left">Number of rows</td> <td align="left">71</td> </tr> <tr class="odd"> <td align="left">Number of columns</td> <td align="left">2</td> </tr> <tr class="even"> <td align="left">_______________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Column type frequency:</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">factor</td> <td align="left">1</td> </tr> <tr class="odd"> <td align="left">numeric</td> <td align="left">1</td> </tr> <tr class="even"> <td align="left">________________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Group variables</td> <td align="left">None</td> </tr> </tbody> </table> <p><strong>Variable type: factor</strong></p> <table> <colgroup> <col width="15%" /> <col width="11%" /> <col width="15%" /> <col width="8%" /> <col width="10%" /> <col width="38%" /> </colgroup> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="left">ordered</th> <th align="right">n_unique</th> <th align="left">top_counts</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">feed</td> <td align="right">0</td> <td align="right">1</td> <td align="left">FALSE</td> <td align="right">6</td> <td align="left">soy: 14, cas: 12, lin: 12, sun: 12</td> </tr> </tbody> </table> <p><strong>Variable type: numeric</strong></p> <table style="width:100%;"> <colgroup> <col width="17%" /> <col width="12%" /> <col width="17%" /> <col width="8%" /> <col width="7%" /> <col width="4%" /> <col width="7%" /> <col width="4%" /> <col width="7%" /> <col width="6%" /> <col width="7%" /> </colgroup> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="right">mean</th> <th align="right">sd</th> <th 
align="right">p0</th> <th align="right">p25</th> <th align="right">p50</th> <th align="right">p75</th> <th align="right">p100</th> <th align="left">hist</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">weight</td> <td align="right">0</td> <td align="right">1</td> <td align="right">261.31</td> <td align="right">78.07</td> <td align="right">108</td> <td align="right">204.5</td> <td align="right">258</td> <td align="right">323.5</td> <td align="right">423</td> <td align="left">▆▆▇▇▃</td> </tr> </tbody> </table> <p>If desired, a more complex function can be created. For example, the lm object also contains fitted values and residuals. We could incorporate these in the data frame.</p> <div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>skim_lm <span class="ot"><-</span> <span class="cf">function</span>(.data, <span class="at">fit =</span> <span class="cn">FALSE</span>) {</span> <span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> .data <span class="ot"><-</span> .data<span class="sc">$</span>model</span> <span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (fit) {</span> <span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> .data <span class="ot"><-</span> .data <span class="sc">%>%</span></span> <span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a> dplyr<span class="sc">::</span><span class="fu">bind_cols</span>(</span> <span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a> <span class="at">fitted =</span> <span class="fu">data.frame</span>(results<span class="sc">$</span>fitted.values),</span> <span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a> <span class="at">residuals =</span> <span class="fu">data.frame</span>(results<span class="sc">$</span>residuals)</span> <span id="cb9-8"><a href="#cb9-8" aria-hidden="true" 
tabindex="-1"></a> )</span> <span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a> }</span> <span id="cb9-10"><a href="#cb9-10" aria-hidden="true" tabindex="-1"></a> skimr<span class="sc">::</span><span class="fu">skim</span>(.data)</span> <span id="cb9-11"><a href="#cb9-11" aria-hidden="true" tabindex="-1"></a>}</span></code></pre></div> <div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">skim_lm</span>(results, <span class="at">fit =</span> <span class="cn">TRUE</span>)</span></code></pre></div> <table> <caption>Data summary</caption> <tbody> <tr class="odd"> <td align="left">Name</td> <td align="left">Piped data</td> </tr> <tr class="even"> <td align="left">Number of rows</td> <td align="left">71</td> </tr> <tr class="odd"> <td align="left">Number of columns</td> <td align="left">4</td> </tr> <tr class="even"> <td align="left">_______________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Column type frequency:</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">factor</td> <td align="left">1</td> </tr> <tr class="odd"> <td align="left">numeric</td> <td align="left">3</td> </tr> <tr class="even"> <td align="left">________________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Group variables</td> <td align="left">None</td> </tr> </tbody> </table> <p><strong>Variable type: factor</strong></p> <table> <colgroup> <col width="15%" /> <col width="11%" /> <col width="15%" /> <col width="8%" /> <col width="10%" /> <col width="38%" /> </colgroup> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="left">ordered</th> <th align="right">n_unique</th> <th align="left">top_counts</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">feed</td> <td 
align="right">0</td> <td align="right">1</td> <td align="left">FALSE</td> <td align="right">6</td> <td align="left">soy: 14, cas: 12, lin: 12, sun: 12</td> </tr> </tbody> </table> <p><strong>Variable type: numeric</strong></p> <table style="width:100%;"> <colgroup> <col width="21%" /> <col width="9%" /> <col width="13%" /> <col width="6%" /> <col width="5%" /> <col width="7%" /> <col width="6%" /> <col width="6%" /> <col width="6%" /> <col width="6%" /> <col width="5%" /> </colgroup> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="right">mean</th> <th align="right">sd</th> <th align="right">p0</th> <th align="right">p25</th> <th align="right">p50</th> <th align="right">p75</th> <th align="right">p100</th> <th align="left">hist</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">weight</td> <td align="right">0</td> <td align="right">1</td> <td align="right">261.31</td> <td align="right">78.07</td> <td align="right">108.00</td> <td align="right">204.50</td> <td align="right">258.00</td> <td align="right">323.50</td> <td align="right">423.00</td> <td align="left">▆▆▇▇▃</td> </tr> <tr class="even"> <td align="left">results.fitted.values</td> <td align="right">0</td> <td align="right">1</td> <td align="right">261.31</td> <td align="right">57.46</td> <td align="right">160.20</td> <td align="right">218.75</td> <td align="right">246.43</td> <td align="right">323.58</td> <td align="right">328.92</td> <td align="left">▃▃▅▃▇</td> </tr> <tr class="odd"> <td align="left">results.residuals</td> <td align="right">0</td> <td align="right">1</td> <td align="right">0.00</td> <td align="right">52.86</td> <td align="right">-123.91</td> <td align="right">-34.41</td> <td align="right">1.57</td> <td align="right">38.17</td> <td align="right">103.09</td> <td align="left">▂▅▇▅▃</td> </tr> </tbody> </table> <p>A second example of the need for a special function is with 
<code>dist</code> objects. The <code>UScitiesD</code> data set is an example of this.</p> <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(UScitiesD)</span></code></pre></div> <pre><code>## [1] "dist"</code></pre> <div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>UScitiesD</span></code></pre></div> <pre><code>## Atlanta Chicago Denver Houston LosAngeles Miami NewYork ## Chicago 587 ## Denver 1212 920 ## Houston 701 940 879 ## LosAngeles 1936 1745 831 1374 ## Miami 604 1188 1726 968 2339 ## NewYork 748 713 1631 1420 2451 1092 ## SanFrancisco 2139 1858 949 1645 347 2594 2571 ## Seattle 2182 1737 1021 1891 959 2734 2408 ## Washington.DC 543 597 1494 1220 2300 923 205 ## SanFrancisco Seattle ## Chicago ## Denver ## Houston ## LosAngeles ## Miami ## NewYork ## SanFrancisco ## Seattle 678 ## Washington.DC 2442 2329</code></pre> <p>A <code>dist</code> object is most often, as in this case, a lower triangular matrix of distances, which can be measured in various ways. 
There are many packages that produce dist objects and/or take dist objects as inputs, including those for cluster analysis and multidimensional scaling.</p> <p>A simple solution to this is to follow a similar design to that for <code>lm</code> objects.</p> <div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>skim_dist <span class="ot"><-</span> <span class="cf">function</span>(.data) {</span> <span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> .data <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="fu">as.matrix</span>(.data))</span> <span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a> skimr<span class="sc">::</span><span class="fu">skim</span>(.data)</span> <span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a>}</span></code></pre></div> <p>However, this has the limitation of treating the dist data as though it is simple numeric data.</p> <p>What we might want to do, instead, is to create a new class, for example, “distance” that is specifically for distance data. This will allow it to have its own <code>sfl</code> or skimr function list.</p> <p>As handling gets more complex, rather than make a new function it can be more powerful to define an <code>as.data.frame</code> S3 method for dist objects, which will allow it to integrate with skimr more completely and allow users to use the <code>skim()</code> function directly. 
In a package you will want to export this.</p> <div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>as.data.frame.dist <span class="ot"><-</span> <span class="cf">function</span>(.data) {</span> <span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a> .data <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="fu">as.matrix</span>(.data))</span> <span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a></span> <span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a> .data[] <span class="ot"><-</span> <span class="fu">lapply</span>(.data, structure, <span class="at">class =</span> <span class="st">"distance"</span>, <span class="at">nms =</span> <span class="fu">names</span>(.data))</span> <span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a> .data</span> <span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a>}</span> <span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a></span> <span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a><span class="fu">as.data.frame</span>(UScitiesD)</span></code></pre></div> <pre><code>## Atlanta Chicago Denver Houston LosAngeles Miami NewYork ## Atlanta 0 587 1212 701 1936 604 748 ## Chicago 587 0 920 940 1745 1188 713 ## Denver 1212 920 0 879 831 1726 1631 ## Houston 701 940 879 0 1374 968 1420 ## LosAngeles 1936 1745 831 1374 0 2339 2451 ## Miami 604 1188 1726 968 2339 0 1092 ## NewYork 748 713 1631 1420 2451 1092 0 ## SanFrancisco 2139 1858 949 1645 347 2594 2571 ## Seattle 2182 1737 1021 1891 959 2734 2408 ## Washington.DC 543 597 1494 1220 2300 923 205 ## SanFrancisco Seattle Washington.DC ## Atlanta 2139 2182 543 ## Chicago 1858 1737 597 ## Denver 949 1021 1494 ## Houston 1645 1891 1220 ## LosAngeles 347 959 2300 ## Miami 2594 2734 923 ## NewYork 2571 2408 205 ## SanFrancisco 0 
678 2442 ## Seattle 678 0 2329 ## Washington.DC 2442 2329 0</code></pre> <p>However, until an <code>sfl</code> is created, <code>skimr</code> will not recognize the class and will fall back to treating the data as if it were character data.</p> <div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">skim</span>(UScitiesD)</span></code></pre></div> <pre><code>## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. Falling back to `character`. ## Warning: Couldn't find skimmers for class: distance; No user-defined `sfl` ## provided. 
Falling back to `character`.</code></pre> <table> <caption>Data summary</caption> <tbody> <tr class="odd"> <td align="left">Name</td> <td align="left">UScitiesD</td> </tr> <tr class="even"> <td align="left">Number of rows</td> <td align="left">10</td> </tr> <tr class="odd"> <td align="left">Number of columns</td> <td align="left">10</td> </tr> <tr class="even"> <td align="left">_______________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Column type frequency:</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">character</td> <td align="left">10</td> </tr> <tr class="odd"> <td align="left">________________________</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">Group variables</td> <td align="left">None</td> </tr> </tbody> </table> <p><strong>Variable type: character</strong></p> <table> <colgroup> <col width="19%" /> <col width="13%" /> <col width="19%" /> <col width="5%" /> <col width="5%" /> <col width="8%" /> <col width="12%" /> <col width="15%" /> </colgroup> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="right">min</th> <th align="right">max</th> <th align="right">empty</th> <th align="right">n_unique</th> <th align="right">whitespace</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">Atlanta</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="even"> <td align="left">Chicago</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="odd"> <td align="left">Denver</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td 
align="right">10</td> <td align="right">0</td> </tr> <tr class="even"> <td align="left">Houston</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="odd"> <td align="left">LosAngeles</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="even"> <td align="left">Miami</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="odd"> <td align="left">NewYork</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="even"> <td align="left">SanFrancisco</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="odd"> <td align="left">Seattle</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> <tr class="even"> <td align="left">Washington.DC</td> <td align="right">0</td> <td align="right">1</td> <td align="right">1</td> <td align="right">4</td> <td align="right">0</td> <td align="right">10</td> <td align="right">0</td> </tr> </tbody> </table> <p>The solution to this is to define an <code>sfl</code> (skimr function list) specifically for the <code>distance</code> class.</p> <div id="defining-sfls-for-a-package" class="section level2"> <h2>Defining sfl’s for a package</h2> <p><code>skimr</code> has an opinionated list of functions for each class 
(e.g. numeric, factor) of data. The core package supports many commonly used classes, but there are many others. You can investigate these defaults by calling <code>get_default_skimmer_names()</code>.</p> <p>What if your data type, like <code>distance</code>, isn’t covered by defaults? <code>skimr</code> usually falls back to treating the type as a character, which isn’t necessarily helpful. In this case, you’re best off adding your data type with <code>skim_with()</code>.</p> <p>Before we begin, we’ll be using the following custom summary statistics throughout. These functions find the nearest and furthest other location for each location.</p> <p>One thing that is important to be aware of when creating statistics functions for skimr is that skimr largely uses tibbles rather than base data frames. This means that many base operations do not work as expected.</p> <div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>get_nearest <span class="ot"><-</span> <span class="cf">function</span>(column) {</span> <span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a> closest <span class="ot"><-</span> <span class="fu">which.min</span>(column[column <span class="sc">!=</span> <span class="dv">0</span>])</span> <span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a> cities <span class="ot"><-</span> <span class="fu">attr</span>(column, <span class="st">"nms"</span>)[column <span class="sc">!=</span> <span class="dv">0</span>]</span> <span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">toString</span>(cities[closest])</span> <span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a>}</span> <span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a></span> <span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a>get_furthest <span class="ot"><-</span> <span 
class="cf">function</span>(column) {</span> <span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a> furthest <span class="ot"><-</span> <span class="fu">which.max</span>(column[column <span class="sc">!=</span> <span class="dv">0</span>])</span> <span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a> cities <span class="ot"><-</span> <span class="fu">attr</span>(column, <span class="st">"nms"</span>)[column <span class="sc">!=</span> <span class="dv">0</span>]</span> <span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">toString</span>(cities[furthest])</span> <span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a>}</span></code></pre></div> <p>These functions, like all summary functions used by <code>skimr</code>, have two notable features.</p> <ul> <li>Each accepts a vector as its single argument.</li> <li>Each returns a scalar, or in R terminology, a vector of length 1.</li> </ul> <p>There are a lot of functions that fulfill these criteria:</p> <ul> <li>existing functions from base, stats, or other packages,</li> <li>lambdas created using the Tidyverse-style syntax,</li> <li>custom functions that have been defined in the <code>skimr</code> package,</li> <li>custom functions that you have defined.</li> </ul> <p>Not fulfilling the two criteria can lead to some very confusing behavior within <code>skimr</code>. Beware! An example of this issue is the base <code>quantile()</code> function, which can return more than one value: in default <code>skimr</code>, percentiles are returned by calling <code>quantile()</code> five times. In the case of our custom functions above, there could be ties, which would result in returning vectors that have length greater than 1. This is handled by collapsing all of the tied values into a single string.</p> <p>Notice, also, that in the case of distance data we may wish to exclude distances of 0, which indicate the distance from a place to itself. 
In finding the minimum our function looks only at the distance to other places.</p> <p>There are at least two ways that you might want to customize skimr handling of a special data type within a package or your own work. The first is to create a custom skimming function.</p> <div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>skim_with_dist <span class="ot"><-</span> <span class="fu">skim_with</span>(</span> <span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a> <span class="at">distance =</span> <span class="fu">sfl</span>(</span> <span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a> <span class="at">nearest =</span> get_nearest,</span> <span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a> <span class="at">furthest =</span> get_furthest</span> <span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a> )</span> <span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a>)</span></code></pre></div> <pre><code>## Creating new skimming functions for the following classes: distance. ## They did not have recognized defaults. 
Call get_default_skimmers() for more information.</code></pre> <div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="fu">skim_with_dist</span>(UScitiesD)</span></code></pre></div> <table> <caption>Data summary</caption> <tbody> <tr class="odd"> <td align="left">Name</td> <td align="left">UScitiesD</td> </tr> <tr class="even"> <td align="left">Number of rows</td> <td align="left">10</td> </tr> <tr class="odd"> <td align="left">Number of columns</td> <td align="left">10</td> </tr> <tr class="even"> <td align="left">_______________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Column type frequency:</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">distance</td> <td align="left">10</td> </tr> <tr class="odd"> <td align="left">________________________</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">Group variables</td> <td align="left">None</td> </tr> </tbody> </table> <p><strong>Variable type: distance</strong></p> <table> <thead> <tr class="header"> <th align="left">skim_variable</th> <th align="right">n_missing</th> <th align="right">complete_rate</th> <th align="left">nearest</th> <th align="left">furthest</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">Atlanta</td> <td align="right">0</td> <td align="right">1</td> <td align="left">Washington.DC</td> <td align="left">Seattle</td> </tr> <tr class="even"> <td align="left">Chicago</td> <td align="right">0</td> <td align="right">1</td> <td align="left">Atlanta</td> <td align="left">SanFrancisco</td> </tr> <tr class="odd"> <td align="left">Denver</td> <td align="right">0</td> <td align="right">1</td> <td align="left">LosAngeles</td> <td align="left">Miami</td> </tr> <tr class="even"> <td align="left">Houston</td> <td align="right">0</td> <td align="right">1</td> <td align="left">Atlanta</td> <td align="left">Seattle</td> 
</tr> <tr class="odd"> <td align="left">LosAngeles</td> <td align="right">0</td> <td align="right">1</td> <td align="left">SanFrancisco</td> <td align="left">NewYork</td> </tr> <tr class="even"> <td align="left">Miami</td> <td align="right">0</td> <td align="right">1</td> <td align="left">Atlanta</td> <td align="left">Seattle</td> </tr> <tr class="odd"> <td align="left">NewYork</td> <td align="right">0</td> <td align="right">1</td> <td align="left">Washington.DC</td> <td align="left">SanFrancisco</td> </tr> <tr class="even"> <td align="left">SanFrancisco</td> <td align="right">0</td> <td align="right">1</td> <td align="left">LosAngeles</td> <td align="left">Miami</td> </tr> <tr class="odd"> <td align="left">Seattle</td> <td align="right">0</td> <td align="right">1</td> <td align="left">SanFrancisco</td> <td align="left">Miami</td> </tr> <tr class="even"> <td align="left">Washington.DC</td> <td align="right">0</td> <td align="right">1</td> <td align="left">NewYork</td> <td align="left">SanFrancisco</td> </tr> </tbody> </table> <p>The example above creates a new <em>function</em>, and you can call that function on a specific column with <code>distance</code> data to get the appropriate summary statistics. The <code>skim_with</code> factory also uses the default skimmers for things like factors, characters, and numerics. Therefore, our <code>skim_with_dist</code> is like the regular <code>skim</code> function with the added ability to summarize <code>distance</code> columns.</p> <p>While this works for any data type and you can also include it within any package (assuming your users load skimr), there is a second, even better, approach. To take full advantage of <code>skimr</code>, we’ll dig a bit into its API.</p> </div> <div id="adding-new-methods" class="section level2"> <h2>Adding new methods</h2> <p><code>skimr</code> has a lookup mechanism, based on the function <code>get_skimmers()</code>, to find default summary functions for each class. 
This is based on the S3 class system. You can learn more about it in <a href="https://adv-r.hadley.nz/s3.html"><em>Advanced R</em></a>.</p> <p>This requires that you add <code>skimr</code> to your list of dependencies.</p> <p>To export a new set of defaults for a data type, create a method for the generic function <code>get_skimmers</code>. Each of those methods returns an <code>sfl</code> (skimr function list). This is the same list-like data structure used in the <code>skim_with()</code> example above. But note! There is one key difference. When adding a method for the generic, we also want to identify the <code>skim_type</code> in the <code>sfl</code>. You will probably want to use <code>skimr::get_skimmers.distance()</code>, but that will not work in a vignette.</p> <p>In a package you will want to export this.</p> <div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="co">#' @importFrom skimr get_skimmers</span></span> <span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a><span class="co">#' @export</span></span> <span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a>get_skimmers.distance <span class="ot"><-</span> <span class="cf">function</span>(column) {</span> <span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">sfl</span>(</span> <span id="cb24-5"><a href="#cb24-5" aria-hidden="true" tabindex="-1"></a> <span class="at">skim_type =</span> <span class="st">"distance"</span>,</span> <span id="cb24-6"><a href="#cb24-6" aria-hidden="true" tabindex="-1"></a> <span class="at">nearest =</span> get_nearest,</span> <span id="cb24-7"><a href="#cb24-7" aria-hidden="true" tabindex="-1"></a> <span class="at">furthest =</span> get_furthest</span> <span id="cb24-8"><a href="#cb24-8" aria-hidden="true" tabindex="-1"></a> )</span> <span id="cb24-9"><a href="#cb24-9" aria-hidden="true" 
tabindex="-1"></a>}</span></code></pre></div> <p>The same strategy follows for other data types.</p> <ul> <li>Create a method</li> <li>return an <code>sfl</code></li> <li>make sure that the <code>skim_type</code> is included.</li> </ul> <p>Users of your package should load <code>skimr</code> to get the <code>skim()</code> function (although you could import and reexport it). Once loaded, a call to <code>get_default_skimmer_names()</code> will return defaults for your data types as well!</p> <div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a><span class="fu">get_default_skimmer_names</span>()</span></code></pre></div> <pre><code>## $AsIs ## [1] "n_unique" "min_length" "max_length" ## ## $Date ## [1] "min" "max" "median" "n_unique" ## ## $POSIXct ## [1] "min" "max" "median" "n_unique" ## ## $Timespan ## [1] "min" "max" "median" "n_unique" ## ## $character ## [1] "min" "max" "empty" "n_unique" "whitespace" ## ## $complex ## [1] "mean" ## ## $difftime ## [1] "min" "max" "median" "n_unique" ## ## $distance ## [1] "nearest" "furthest" ## ## $factor ## [1] "ordered" "n_unique" "top_counts" ## ## $haven_labelled ## [1] "mean" "sd" "p0" "p25" "p50" "p75" "p100" "hist" ## ## $list ## [1] "n_unique" "min_length" "max_length" ## ## $logical ## [1] "mean" "count" ## ## $numeric ## [1] "mean" "sd" "p0" "p25" "p50" "p75" "p100" "hist" ## ## $ts ## [1] "start" "end" "frequency" "deltat" "mean" ## [6] "sd" "min" "max" "median" "line_graph"</code></pre> <p>They will then be able to use <code>skim()</code> directly.</p> <div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a><span class="fu">skim</span>(UScitiesD)</span></code></pre></div> <table> <caption>Data summary</caption> <tbody> <tr class="odd"> <td align="left">Name</td> <td align="left">UScitiesD</td> </tr> <tr 
class="even"> <td align="left">Number of rows</td> <td align="left">10</td> </tr> <tr class="odd"> <td align="left">Number of columns</td> <td align="left">10</td> </tr> <tr class="even"> <td align="left">_______________________</td> <td align="left"></td> </tr> <tr class="odd"> <td align="left">Column type frequency:</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">distance</td> <td align="left">10</td> </tr> <tr class="odd"> <td align="left">________________________</td> <td align="left"></td> </tr> <tr class="even"> <td align="left">Group variables</td> <td align="left">None</td> </tr> </tbody> </table> <p><strong>Variable ty