longestrepeatedstrings
Version:
Finds duplicated text strings and generates a report about the longest substrings or most frequent words in supplied text
171 lines (170 loc) • 11.5 kB
HTML
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Longest Repeated Strings</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Winky+Sans:ital,wght@0,300..900;1,300..900&display=swap" rel="stylesheet">
<link rel="apple-touch-icon" sizes="180x180" href="180.png">
<link rel="icon" type="image/png" sizes="32x32" href="32.png">
<link rel="icon" type="image/png" sizes="16x16" href="16.png">
<link rel="manifest" href="site.webmanifest">
</head>
<body>
<div class="container">
<h1>Longest Repeated Strings</h1>
<p class="intro">Finds <em>duplicated text</em> strings and generates a report about the <em>longest substrings</em> or <em>most frequent words</em> in supplied text, weighted by how much space the string takes up overall (length * occurrences).</p>
<p>Paste text or select files to analyze!</p>
<textarea id="textInput" placeholder="Enter text here..."></textarea>
<div id="upload"><label><span>📁</span><input type="file" id="fileInput" multiple></label></div>
<div class="opts">
<h2>Options</h2>
<label title="The minimum length of substrings to consider." class="hr">Min match length: <input type="number" id="minLen" value="4"></label>
<label title="The maximum length of substrings to consider." class="hr">Max match length: <input type="number" id="maxLen" value="40"></label>
<br>
<label title="The minimum number of occurrences a substring must have to be included." class="hr">Min occurrences: <input type="number" id="minOcc" value="2"></label>
<label title="The maximum number of results to include in the report." class="hr">Max results: <input type="number" id="maxRes" value="50"></label>
<br>
<label title="Helps order results." class="fr">Per-occurrence score penalty: <input type="number" id="penalty" value="0"></label>
<br>
<h4>Input splitting</h4>
<h5>(Splitting helps speed up processing)</h5>
<div class="split">
<label style="font-size: 1.2em;">Split input after (chars are retained):</label>
<br>
<label><input type="checkbox" name="split" value=" " checked><span class="char"> </span>(space)</label>
<label><input type="checkbox" name="split" value="\n" checked><span class="char">\n</span>(newline)</label>
<label><input type="checkbox" name="split" value="\t" checked><span class="char">\t</span>(tab)</label>
<label><input type="checkbox" name="split" value=":" checked><span class="char">:</span>(colon)</label>
<label><input type="checkbox" name="split" value=";" checked><span class="char">;</span>(semi-colon)</label>
<label><input type="checkbox" name="split" value="." checked><span class="char">.</span>(full-stop)</label>
<label><input type="checkbox" name="split" value="," checked><span class="char">,</span>(comma)</label>
<label><input type="checkbox" name="split" value='"'><span class="char">"</span>(double-quote)</label>
<label><input type="checkbox" name="split" value="'"><span class="char">'</span>(single-quote)</label>
<label><input type="checkbox" name="split" value="{"><span class="char">{</span>(open brace)</label>
<label><input type="checkbox" name="split" value="}"><span class="char">}</span>(close brace)</label>
<label><input type="checkbox" name="split" value="("><span class="char">(</span>(open paren)</label>
<label><input type="checkbox" name="split" value=")"><span class="char">)</span>(close paren)</label>
<label><input type="checkbox" name="split" value="["><span class="char">[</span>(open bracket)</label>
<label><input type="checkbox" name="split" value="]"><span class="char">]</span>(close bracket)</label>
<label><input type="checkbox" name="split" value="="><span class="char">=</span>(equals)</label>
</div>
<div class="break">
<label style="font-size: 1.2em;" title="Add your own" class="cr">Custom split input after (char is retained): <input type="text" size="2" id="split-custom" value=""></label>
<label style="font-size: 1.2em;" title="Can be used to concatenate texts." class="cr">Split input by (char is discarded): <input type="text" size="2" id="break" value=""></label>
</div>
<br>
<h4>Processing</h4>
<label title="Matches on whole words."><input type="checkbox" id="words" checked> Whole words</label>
<label title="Symbols won't appear in matches"><input type="checkbox" id="clean" checked> Strip symbols</label>
<label title="Trims white space from results."><input type="checkbox" id="trim" checked> Trim white space</label>
<br>
<label title="Substrings to omit from the results. Can be used to ignore accepted long/frequent words.">Words to omit (one per line):</label>
<br>
<textarea id="omit" style="height: 40px; field-sizing: content; width: 50%;"></textarea>
<br>
<h4>Report</h4>
<label title="Character/s to insert between each result." class="cr">Delimiter: <textarea rows="1" id="delim" cols="2">★</textarea></label>
<label title="Character/s to insert before the repeat count." class="cr">Open paren: <input type="text" id="open" size="2" value="⦅"></label>
<label title="Character/s to insert after the repeat count." class="cr">Close paren: <input type="text" id="close" size="2" value="×⦆"></label>
</div>
<button class="analyze" onclick="analyze()">Analyze</button>
<h3>Results:</h3>
<div class="results" id="results"></div>
</div>
<footer>Created Mar 2025 by D.A. Braksator. <a href="https://github.com/braksator/LongestRepeatedStrings">Github repo</a></footer>
<style>
html, body { min-height: 100%; margin: 0; }
body { font-family: Futura, "Trebuchet MS", Arial, sans-serif; text-align: center; padding: 1em; background: repeating-linear-gradient(-45deg,#efefef,#efefef 5px,#fefefe 6px,#efefef 20px); }
h1, h2, h4 { font-size: 32px; font-family: "Winky Sans", sans-serif; font-weight: bold; }
h2 { font-size: 24px; margin: 0 0 .5em 0; }
h5 { margin: .25em 0; }
.container { max-width: 960px; margin: auto; background: white; padding: 20px; border-radius: 10px; box-shadow: 0px 0px 10px #aaa; }
textarea { width: 100%; height: 100px; max-width: 100%; }
input, button { margin: 5px; padding: .25em; }
button { font-size: 24px; }
input[type=number] { width: 50px; }
label { font-variant: small-caps; line-height: 2em; white-space: pre; }
.hr { width: 25%; display: inline-block; text-align: right; min-width: 220px; }
.results { text-align: left; padding: 27px 1em 52px 1em; white-space: pre-wrap; line-height: 27px; background: repeating-linear-gradient(#fbfbb2, #fbfbb2 25px, #9abd6e 26px, #9abd6e 27px); }
h3 { font-size: 14px; text-align: left; background: #fbfbb2; position: relative; font-variant: small-caps; margin: 20px 0 0 0; padding: 2em 1.4em; user-select: none; color: #444; border-bottom: 3px double #d9ac68; }
footer { margin-top: 20px; font-size: 14px; color: #555; background: #fff; max-width: 960px; padding: .2em; margin: 1em auto; border-radius: 3px; opacity: .6 }
.opts { background-color: #f2f2f2; padding: 1em; margin: 1em; border-radius: 10px }
a { color: #333; }
.analyze { padding: .25em 2em; color: #fff; background: #369c17; border: 0; border-radius: 5px; &:hover { background: #40aa20;} }
.intro { opacity: .6; font-size: 2.2em; font-family: Garamond, Baskerville, Baskerville Old Face, Hoefler Text, Times New Roman, serif; transform: scale(1.2) scaleX(.8); margin: 2em 0; }
.cr { font-size: .8em; input { font-size: .8em; }}
#upload { font-size: 24px; margin-top: .5em;
label {
border: 3px dashed rgb(255, 165, 0, .6); border-radius: 10px; padding: 24px 0 24px 24px; background: #f9fbff; cursor: pointer; span { font-size: 40px; position: relative; top: 4px; }
&:hover { background: #f0f4fc; input[type=file]::file-selector-button { background: #fafafa; } }
}
}
input[type=file] { font-size: 24px; cursor: pointer; text-align: center; margin: 0 auto; margin-left: .5em;
&::file-selector-button { margin: 1em auto; margin-right: 1em; cursor: pointer; padding: .25em 2em; border-radius: 5px; border-width: 0; background: #f2f2f2; }
}
#delim { height:1em; width:2.4em; resize: none; padding: 0.25em; vertical-align: middle; font-size: 1.3em; line-height: 1em;}
#delim::-webkit-scrollbar { display: none; }
h4 { margin: 0.25em 0; font-size: 16px; }
.char { padding: 0 .25em; background: #ccc; margin-right: 3px; }
.split { text-align: left; font-size: .8em; }
.break { font-size: .8em; }
</style>
<script>
/**
 * Collects the report formatting strings from the "Report" controls.
 *
 * Reads the current `value` of the elements with ids `delim`, `open` and
 * `close`, and returns them under keys of the same names.
 *
 * @returns {{delim: string, open: string, close: string}} The delimiter and
 *   the opening/closing strings as currently entered on the page.
 */
function getChars() {
  const chars = {};
  // Each key in the returned object is also the id of the control it is read from.
  for (const id of ['delim', 'open', 'close']) {
    chars[id] = document.getElementById(id).value;
  }
  return chars;
}
/**
 * Builds the analysis options object from the controls in the options panel.
 *
 * - Numeric fields (`minLen`, `maxLen`, `minOcc`, `maxRes`, `penalty`) are
 *   converted with unary `+`, so an empty field yields 0.
 * - `omit` is the `omit` textarea split on newlines, each entry trimmed, with
 *   empty entries dropped.
 * - `trim`, `clean` and `words` mirror the checked state of their checkboxes.
 * - `break` is a one-element array holding the `break` field's value, or an
 *   empty array when that field is blank.
 * - `split` is the value of every checked `name="split"` checkbox, followed by
 *   the `split-custom` field's value when it is non-empty.
 *
 * @returns {Object} The options object described above.
 */
function getOpts() {
  const byId = id => document.getElementById(id);
  const numberOf = id => +byId(id).value;
  const isChecked = id => byId(id).checked;

  // Checked preset split characters first, then the optional custom one.
  const split = Array.from(
    document.querySelectorAll('input[name="split"]:checked'),
    box => box.value
  );
  const customSplit = byId('split-custom').value;
  if (customSplit) {
    split.push(customSplit);
  }

  const omit = byId('omit').value
    .split('\n')
    .map(entry => entry.trim())
    .filter(Boolean);

  const breakOn = byId('break').value;

  return {
    minLen: numberOf('minLen'),
    maxLen: numberOf('maxLen'),
    minOcc: numberOf('minOcc'),
    maxRes: numberOf('maxRes'),
    penalty: numberOf('penalty'),
    omit: omit,
    trim: isChecked('trim'),
    clean: isChecked('clean'),
    words: isChecked('words'),
    break: breakOn ? [breakOn] : [],
    split: split,
  };
}
/**
 * Analyzes the text typed or pasted into the `textInput` textarea and writes
 * the report into the `results` element.
 *
 * A status message is written immediately: a "processing" notice when there
 * is text, or a warning when the textarea is empty. The analysis itself runs
 * from a 1 ms timer; when there is no text the timer callback does nothing.
 * NOTE(review): the timer presumably gives the browser a chance to show the
 * status message before the analysis runs — confirm.
 */
function analyzeText() {
  const text = document.getElementById('textInput').value;
  const results = document.getElementById('results');

  if (text) {
    results.textContent = '⌛ Processing...\r\n';
  } else {
    results.textContent = '⚠️ No input text supplied!';
  }

  setTimeout(function () {
    if (!text) {
      return;
    }
    const analysis = lrs.text(text, getOpts());
    const report = lrs.textReport(analysis, 0, getChars());
    // A falsy report is shown as an explicit "no results" message.
    results.textContent = report || '📄 No results.';
  }, 1);
}
/**
 * Click handler for the "Analyze" button.
 *
 * Shows a "processing" notice in the `results` element, then (from a 1 ms
 * timer) either:
 *  - delegates to `analyzeText()` when no files are selected in `fileInput`, or
 *  - reads every selected file as text and reports on each one.
 *
 * File reports are shown in the order the files appear in the selection,
 * regardless of which read finishes first. The "processing" notice stays at
 * the end of the output while reads are pending and is removed once every
 * file has been handled. A file that cannot be read or analyzed gets its own
 * warning line instead of silently disappearing from the output.
 */
function analyze() {
  const files = document.getElementById('fileInput').files;
  const results = document.getElementById('results');
  const processing = '⌛ Processing...\r\n';

  results.textContent = processing;

  setTimeout(() => {
    if (!files.length) return analyzeText();

    const selected = [...files];
    // One slot per file, filled as each read settles; null means "still pending".
    const reports = new Array(selected.length).fill(null);
    let pending = selected.length;

    // Re-draws the output: finished reports in selection order, followed by
    // the processing notice for as long as any file is outstanding.
    const render = () => {
      const done = reports.filter(report => report !== null).join('');
      results.textContent = done + (pending ? processing : '');
    };

    selected.forEach((file, index) => {
      const reader = new FileReader();

      const settle = report => {
        reports[index] = report;
        pending--;
        render();
      };

      reader.onload = () => {
        let res;
        try {
          res = lrs.textReport(lrs.text(reader.result, getOpts()), 0, getChars());
        } catch (err) {
          // Keep the failure visible in the console as well as on the page.
          console.error(err);
          settle(`⚠️ Could not analyze "${file.name}": ${err && err.message ? err.message : err} \r\n`);
          return;
        }
        settle(`📄 Analysis of repeated strings in "${file.name}": ${res ? res : 'No results.'} \r\n`);
      };

      reader.onerror = () => {
        settle(`⚠️ Could not read "${file.name}". \r\n`);
      };

      reader.readAsText(file);
    });
  }, 1);
}
// Stub `require`: ignores its arguments and always returns 0.
// NOTE(review): presumably lets a CommonJS-style index.js (loaded by the
// <script src="index.js"> tag that follows) call require() in the browser
// without throwing — confirm against index.js.
function require() { return 0; }
// Assigns an empty object to `module` without var/let/const, which creates a
// global in non-strict script code, before index.js is loaded.
// NOTE(review): presumably so index.js can assign to `module.exports` — confirm.
module = {};
</script>
<script src="index.js"></script>
</body>
</html>