qminer
Version:
A C++ based data analytics platform for processing large-scale real-time streams containing structured and unstructured data
305 lines (301 loc) • 12.2 kB
HTML
<html>
<head>
<meta name="generator" content="JSDoc 3">
<meta charset="utf-8">
<title>Class: Tokenizer</title>
<link rel="stylesheet" href="https://brick.a.ssl.fastly.net/Karla:400,400i,700,700i" type="text/css">
<link rel="stylesheet" href="https://brick.a.ssl.fastly.net/Noto+Serif:400,400i,700,700i" type="text/css">
<link rel="stylesheet" href="https://brick.a.ssl.fastly.net/Inconsolata:500" type="text/css">
<link href="css/baseline.css" rel="stylesheet">
</head>
<body onload="prettyPrint()">
<nav id="jsdoc-navbar" role="navigation" class="jsdoc-navbar">
<div id="jsdoc-navbar-container">
<div id="jsdoc-navbar-content">
<a href="index.html" class="jsdoc-navbar-package-name">QMiner JavaScript API v9.4.0</a>
</div>
</div>
</nav>
<div id="jsdoc-body-container">
<div id="jsdoc-content">
<div id="jsdoc-content-container">
<div id="jsdoc-main" role="main">
<header class="page-header">
<div class="symbol-detail-labels"><span class="label label-kind">class</span> <span class="label label-static">static</span></div>
<h1><small><a href="module-analytics.html">analytics</a>.<wbr></small><span class="symbol-name">Tokenizer</span></h1>
<p class="source-link">Source: <a href="analyticsdoc.js.html#source-line-1504">analyticsdoc.<wbr>js:1504</a></p>
<div class="symbol-classdesc">
<p>Breaks text into tokens (i.e. words).</p>
</div>
<dl class="dl-compact">
</dl>
</header>
<section id="summary">
<div class="summary-callout">
<h2 class="summary-callout-heading">Methods</h2>
<div class="summary-content">
<div class="summary-column">
<dl class="dl-summary-callout">
<dt><a href="module-analytics.Tokenizer.html#getParagraphs">getParagraphs(str)</a></dt>
<dd>
</dd>
</dl>
</div>
<div class="summary-column">
<dl class="dl-summary-callout">
<dt><a href="module-analytics.Tokenizer.html#getSentences">getSentences(str)</a></dt>
<dd>
</dd>
</dl>
</div>
<div class="summary-column">
<dl class="dl-summary-callout">
<dt><a href="module-analytics.Tokenizer.html#getTokens">getTokens(str)</a></dt>
<dd>
</dd>
</dl>
</div>
</div>
</div>
</section>
<section>
<h2 id="Tokenizer">new <span class="symbol-name">Tokenizer</span><span class="signature"><span class="signature-params">([arg])</span></span></h2>
<p>Tokenizer</p>
<section>
<h3>
Example
</h3>
<div>
<pre class="prettyprint"><code>// import analytics module
var analytics = require('qminer').analytics;
// construct Tokenizer object
var tokenizer = new analytics.Tokenizer({ type: "simple" });</code></pre>
</div>
</section>
<section>
<h3>Parameter</h3>
<table class="jsdoc-details-table">
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Optional</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<p>arg</p>
</td>
<td>
<p><a href="module-analytics.html#~tokenizerParam">module:analytics~tokenizerParam</a></p>
</td>
<td>
<p>Yes</p>
</td>
<td>
<p>Construction arguments. If <code>arg</code> is not given, the <code>'unicode'</code> tokenizer type is used.</p>
</td>
</tr>
</tbody>
</table>
</section>
<dl class="dl-compact">
</dl>
</section>
<section>
<h2>Methods</h2>
<section>
<h3 id="getParagraphs"><span class="symbol-name">getParagraphs</span><span class="signature"><span class="signature-params">(str)</span> → <span class="signature-returns"> Array of String</span></span></h3>
<p>Breaks string into paragraphs.</p>
<section>
<h4>
Example
</h4>
<div>
<pre class="prettyprint"><code>// import modules
var analytics = require('qminer').analytics;
var la = require('qminer').la;
// construct model
var tokenizer = new analytics.Tokenizer();
// string you wish to tokenize
var string = "Yes!\t No?\n Maybe...";
// tokenize text using getParagraphs
var tokens = tokenizer.getParagraphs(string);
// output:
tokens = ["Yes", " No", " Maybe"];</code></pre>
</div>
</section>
<section>
<h4>Parameter</h4>
<table class="jsdoc-details-table">
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Optional</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<p>str</p>
</td>
<td>
<p>String</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>String given to break into paragraphs.</p>
</td>
</tr>
</tbody>
</table>
</section>
<dl class="dl-compact">
<dt>Returns</dt>
<dd>
<p><code>Array of String</code> — Array of paragraphs. The number of returned paragraphs is equal to the number of paragraphs in the input <code>str</code>.
When the function detects one of the escape sequences <code>'\n'</code>, <code>'\r'</code> or <code>'\t'</code>, it starts a new paragraph.</p>
</dd>
</dl>
<h3 id="getSentences"><span class="symbol-name">getSentences</span><span class="signature"><span class="signature-params">(str)</span> → <span class="signature-returns"> Array of String</span></span></h3>
<p>Breaks string into sentences.</p>
<section>
<h4>
Example
</h4>
<div>
<pre class="prettyprint"><code>// import modules
var analytics = require('qminer').analytics;
var la = require('qminer').la;
// construct model
var tokenizer = new analytics.Tokenizer();
// string you wish to tokenize
var string = "C++? Alright. Let's do this!";
// tokenize text using getSentences
var tokens = tokenizer.getSentences(string);
// output:
tokens = ["C++", " Alright", " Let's do this"];</code></pre>
</div>
</section>
<section>
<h4>Parameter</h4>
<table class="jsdoc-details-table">
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Optional</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<p>str</p>
</td>
<td>
<p>String</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>String given to break into sentences.</p>
</td>
</tr>
</tbody>
</table>
</section>
<dl class="dl-compact">
<dt>Returns</dt>
<dd>
<p><code>Array of String</code> — Array of sentences. The number of returned sentences is equal to the number of sentences in the input <code>str</code>.
How the function breaks sentences depends on where you use a full stop, exclamation mark, question mark or the new line command.
Careful: the space between the lines is not ignored.</p>
</dd>
</dl>
<h3 id="getTokens"><span class="symbol-name">getTokens</span><span class="signature"><span class="signature-params">(str)</span> → <span class="signature-returns"> Array of String</span></span></h3>
<p>Tokenizes given string.</p>
<section>
<h4>
Example
</h4>
<div>
<pre class="prettyprint"><code>// import modules
var analytics = require('qminer').analytics;
var la = require('qminer').la;
// construct model
var tokenizer = new analytics.Tokenizer();
// string you wish to tokenize
var string = "What a beautiful day!";
// tokenize string using getTokens
var tokens = tokenizer.getTokens(string);
// output:
tokens = ["What", "a", "beautiful", "day"];</code></pre>
</div>
</section>
<section>
<h4>Parameter</h4>
<table class="jsdoc-details-table">
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Optional</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<p>str</p>
</td>
<td>
<p>String</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>String given to tokenize.</p>
</td>
</tr>
</tbody>
</table>
</section>
<dl class="dl-compact">
<dt>Returns</dt>
<dd>
<p><code>Array of String</code> — Array of tokens. The number of tokens is equal to the number of words in the input <code>str</code>.
The function keeps only words and skips all punctuation.
Tokenizing contractions (e.g. don't) depends on which tokenizer type you use. Example: type <code>'html'</code> breaks contractions into 2 tokens.</p>
</dd>
</dl>
</section>
</section>
</div>
</div>
<nav id="jsdoc-toc-nav" role="navigation"></nav>
</div>
</div>
<footer id="jsdoc-footer" class="jsdoc-footer">
<div id="jsdoc-footer-container">
<p>
</p>
</div>
</footer>
<script src="scripts/jquery.min.js"></script>
<script src="scripts/tree.jquery.js"></script>
<script src="scripts/prettify.js"></script>
<script src="scripts/jsdoc-toc.js"></script>
<script src="scripts/linenumber.js"></script>
<script src="scripts/scrollanchor.js"></script>
</body>
</html>