UNPKG

longestrepeatedstrings

Version:

Finds duplicated text strings and generates a report about the longest substrings or most frequent words in supplied text

171 lines (170 loc) 11.5 kB
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Longest Repeated Strings</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Winky+Sans:ital,wght@0,300..900;1,300..900&display=swap" rel="stylesheet">
  <link rel="apple-touch-icon" sizes="180x180" href="180.png">
  <link rel="icon" type="image/png" sizes="32x32" href="32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="16.png">
  <link rel="manifest" href="site.webmanifest">
  <!-- Page styles. Kept in <head>: <style> is metadata content and is not valid inside <body>. -->
  <style>
    /* ---- Page frame ---- */
    html, body {
      min-height: 100%;
      margin: 0;
    }
    body {
      font-family: Futura, "Trebuchet MS", Arial, sans-serif;
      text-align: center;
      padding: 1em;
      background: repeating-linear-gradient(-45deg,#efefef,#efefef 5px,#fefefe 6px,#efefef 20px);
    }

    /* ---- Headings ---- */
    h1, h2, h4 {
      font-size: 32px;
      font-family: "Winky Sans", sans-serif;
      font-weight: bold;
    }
    h2 {
      font-size: 24px;
      margin: 0 0 .5em 0;
    }
    h5 {
      margin: .25em 0;
    }

    /* ---- Main card and form controls ---- */
    .container {
      max-width: 960px;
      margin: auto;
      background: white;
      padding: 20px;
      border-radius: 10px;
      box-shadow: 0px 0px 10px #aaa;
    }
    textarea {
      width: 100%;
      height: 100px;
      max-width: 100%;
    }
    input, button {
      margin: 5px;
      padding: .25em;
    }
    button {
      font-size: 24px;
    }
    input[type=number] {
      width: 50px;
    }
    /* NOTE: white-space: pre makes every space and line break inside a <label>
       visible, so label markup must stay on a single line. */
    label {
      font-variant: small-caps;
      line-height: 2em;
      white-space: pre;
    }
    /* Right-aligned label column used by the numeric options. */
    .hr {
      width: 25%;
      display: inline-block;
      text-align: right;
      min-width: 220px;
    }

    /* ---- Results area: ruled "notepad" whose 27px line-height matches the gradient period ---- */
    .results {
      text-align: left;
      padding: 27px 1em 52px 1em;
      white-space: pre-wrap;
      line-height: 27px;
      background: repeating-linear-gradient(#fbfbb2, #fbfbb2 25px, #9abd6e 26px, #9abd6e 27px);
    }
    h3 {
      font-size: 14px;
      text-align: left;
      background: #fbfbb2;
      position: relative;
      font-variant: small-caps;
      margin: 20px 0 0 0;
      padding: 2em 1.4em;
      user-select: none;
      color: #444;
      border-bottom: 3px double #d9ac68;
    }

    /* ---- Footer, options panel, links, primary button ---- */
    footer {
      margin-top: 20px;
      font-size: 14px;
      color: #555;
      background: #fff;
      max-width: 960px;
      padding: .2em;
      margin: 1em auto;
      border-radius: 3px;
      opacity: .6
    }
    .opts {
      background-color: #f2f2f2;
      padding: 1em;
      margin: 1em;
      border-radius: 10px
    }
    a {
      color: #333;
    }
    .analyze {
      padding: .25em 2em;
      color: #fff;
      background: #369c17;
      border: 0;
      border-radius: 5px;
      &:hover {
        background: #40aa20;
      }
    }
    .intro {
      opacity: .6;
      font-size: 2.2em;
      font-family: Garamond, Baskerville, Baskerville Old Face, Hoefler Text, Times New Roman, serif;
      transform: scale(1.2) scaleX(.8);
      margin: 2em 0;
    }
    /* Compact label + input pairs. */
    .cr {
      font-size: .8em;
      input {
        font-size: .8em;
      }
    }

    /* ---- File picker drop area ---- */
    #upload {
      font-size: 24px;
      margin-top: .5em;
      label {
        border: 3px dashed rgb(255, 165, 0, .6);
        border-radius: 10px;
        padding: 24px 0 24px 24px;
        background: #f9fbff;
        cursor: pointer;
        span {
          font-size: 40px;
          position: relative;
          top: 4px;
        }
        &:hover {
          background: #f0f4fc;
          input[type=file]::file-selector-button {
            background: #fafafa;
          }
        }
      }
    }
    input[type=file] {
      font-size: 24px;
      cursor: pointer;
      text-align: center;
      margin: 0 auto;
      margin-left: .5em;
      &::file-selector-button {
        margin: 1em auto;
        margin-right: 1em;
        cursor: pointer;
        padding: .25em 2em;
        border-radius: 5px;
        border-width: 0;
        background: #f2f2f2;
      }
    }

    /* ---- Small option widgets ---- */
    /* The delimiter is a one-line textarea; its scrollbar is hidden and resizing disabled. */
    #delim {
      height: 1em;
      width: 2.4em;
      resize: none;
      padding: 0.25em;
      vertical-align: middle;
      font-size: 1.3em;
      line-height: 1em;
    }
    #delim::-webkit-scrollbar {
      display: none;
    }
    h4 {
      margin: 0.25em 0;
      font-size: 16px;
    }
    .char {
      padding: 0 .25em;
      background: #ccc;
      margin-right: 3px;
    }
    .split {
      text-align: left;
      font-size: .8em;
    }
    .break {
      font-size: .8em;
    }
  </style>
</head>
<body>
  <div class="container">
    <h1>Longest Repeated Strings</h1>
    <p class="intro">Finds <em>duplicated text</em> strings and generates a report about the <em>longest substrings</em> or <em>most frequent words</em> in supplied text, weighted by how much space the string takes up overall (length * occurrences).</p>
    <p>Paste text or select files to analyze!</p>
    <!-- Input sources: pasted text, or one or more files (files take precedence when any are selected). -->
    <textarea id="textInput" placeholder="Enter text here..." aria-label="Text to analyze"></textarea>
    <div id="upload"><label><span>📁</span><input type="file" id="fileInput" multiple aria-label="Files to analyze"></label></div>

    <!-- Every control in this panel is read by getOpts() / getChars() via its id or name.
         Label contents are kept on one line because labels render with white-space: pre. -->
    <div class="opts">
      <h2>Options</h2>
      <label title="The minimum length of substrings to consider." class="hr">Min match length: <input type="number" id="minLen" value="4"></label>
      <label title="The maximum length of substrings to consider." class="hr">Max match length: <input type="number" id="maxLen" value="40"></label>
      <br>
      <label title="The minimum number of occurrences a substring must have to be included." class="hr">Min occurrences: <input type="number" id="minOcc" value="2"></label>
      <label title="The maximum number of results to include in the report." class="hr">Max results: <input type="number" id="maxRes" value="50"></label>
      <br>
      <label title="Helps order results." class="fr">Per-occurrence score penalty: <input type="number" id="penalty" value="0"></label>
      <br>

      <h4>Input splitting</h4>
      <h5>(Splitting helps speed up processing)</h5>
      <!-- Each checked box contributes its value attribute to the "split" option. -->
      <div class="split">
        <label style="font-size: 1.2em;">Split input after (chars are retained):</label>
        <br>
        <label><input type="checkbox" name="split" value=" " checked><span class="char"> </span>(space)</label>
        <label><input type="checkbox" name="split" value="\n" checked><span class="char">\n</span>(newline)</label>
        <label><input type="checkbox" name="split" value="\t" checked><span class="char">\t</span>(tab)</label>
        <label><input type="checkbox" name="split" value=":" checked><span class="char">:</span>(colon)</label>
        <label><input type="checkbox" name="split" value=";" checked><span class="char">;</span>(semi-colon)</label>
        <label><input type="checkbox" name="split" value="." checked><span class="char">.</span>(full-stop)</label>
        <label><input type="checkbox" name="split" value="," checked><span class="char">,</span>(comma)</label>
        <label><input type="checkbox" name="split" value='"'><span class="char">"</span>(double-quote)</label>
        <label><input type="checkbox" name="split" value="'"><span class="char">'</span>(single-quote)</label>
        <label><input type="checkbox" name="split" value="{"><span class="char">{</span>(open brace)</label>
        <label><input type="checkbox" name="split" value="}"><span class="char">}</span>(close brace)</label>
        <label><input type="checkbox" name="split" value="("><span class="char">(</span>(open paren)</label>
        <label><input type="checkbox" name="split" value=")"><span class="char">)</span>(close paren)</label>
        <label><input type="checkbox" name="split" value="["><span class="char">[</span>(open bracket)</label>
        <label><input type="checkbox" name="split" value="]"><span class="char">]</span>(close bracket)</label>
        <label><input type="checkbox" name="split" value="="><span class="char">=</span>(equals)</label>
      </div>
      <div class="break">
        <label style="font-size: 1.2em;" title="Add your own" class="cr">Custom split input after (char is retained): <input type="text" size="2" id="split-custom" value=""></label>
        <label style="font-size: 1.2em;" title="Can be used to concatenate texts." class="cr">Split input by (char is discarded): <input type="text" size="2" id="break" value=""></label>
      </div>
      <br>

      <h4>Processing</h4>
      <label title="Matches on whole words."><input type="checkbox" id="words" checked> Whole words</label>
      <label title="Symbols won't appear in matches"><input type="checkbox" id="clean" checked> Strip symbols</label>
      <label title="Trims white space from results."><input type="checkbox" id="trim" checked> Trim white space</label>
      <br>
      <label for="omit" title="Substrings to omit from the results. Can be used to ignore accepted long/frequent words.">Words to omit (one per line):</label>
      <br>
      <textarea id="omit" style="height: 40px; field-sizing: content; width: 50%;"></textarea>
      <br>

      <h4>Report</h4>
      <label title="Character/s to insert between each result." class="cr">Delimiter: <textarea id="delim" rows="1" cols="2"></textarea></label>
      <label title="Character/s to insert before the repeat count." class="cr">Open paren: <input type="text" id="open" size="2" value="⦅"></label>
      <label title="Character/s to insert after the repeat count." class="cr">Close paren: <input type="text" id="close" size="2" value="×⦆"></label>
    </div>

    <button type="button" class="analyze" onclick="analyze()">Analyze</button>
    <h3>Results:</h3>
    <!-- Must stay empty in markup: this element renders with white-space: pre-wrap. -->
    <div class="results" id="results"></div>
  </div>
  <footer>Created Mar 2025 by D.A. Braksator. <a href="https://github.com/braksator/LongestRepeatedStrings">Github repo</a></footer>

  <script>
    // NOTE(review): `lrs` is not defined in this file; it is presumably created by
    // index.js, which is loaded right after this script. Confirm against index.js.

    /**
     * Reads the report formatting characters from the "Report" options.
     * @returns {{delim: string, open: string, close: string}} Current values of
     *   the #delim, #open and #close controls.
     */
    function getChars() {
      return {
        delim: document.getElementById('delim').value,
        open: document.getElementById('open').value,
        close: document.getElementById('close').value,
      };
    }

    /**
     * Collects the analysis options from the options panel.
     * @returns {Object} Options object with:
     *   minLen, minOcc, maxLen, maxRes, penalty - numeric inputs coerced with unary plus;
     *   omit  - non-empty, trimmed lines of the #omit textarea;
     *   trim, clean, words - checkbox states;
     *   break - one-element array holding the #break value, or [] when it is empty;
     *   split - values of all checked "split" boxes, plus the custom value when set.
     */
    function getOpts() {
      const split = [...document.querySelectorAll('input[name="split"]:checked')].map(cb => cb.value),
        customSplit = document.getElementById('split-custom').value,
        breakChar = document.getElementById('break').value;
      if (customSplit) split.push(customSplit);
      return {
        minLen: +document.getElementById('minLen').value,
        maxLen: +document.getElementById('maxLen').value,
        minOcc: +document.getElementById('minOcc').value,
        maxRes: +document.getElementById('maxRes').value,
        penalty: +document.getElementById('penalty').value,
        omit: document.getElementById('omit').value.split('\n').map(o => o.trim()).filter(Boolean),
        trim: document.getElementById('trim').checked,
        clean: document.getElementById('clean').checked,
        words: document.getElementById('words').checked,
        break: breakChar ? [breakChar] : [],
        split: split,
      };
    }

    /**
     * Analyzes the text pasted into #textInput and writes the report (or a
     * warning when there is no text) into #results.
     */
    function analyzeText() {
      const text = document.getElementById('textInput').value,
        results = document.getElementById('results');
      if (!text) {
        results.textContent = '⚠️ No input text supplied!';
        return;
      }
      results.textContent = '⌛ Processing...\r\n';
      // Deferred so the processing message can be displayed before the analysis runs.
      setTimeout(() => {
        const res = lrs.textReport(lrs.text(text, getOpts()), 0, getChars());
        results.textContent = res ? res : '📄 No results.';
      }, 1);
    }

    /**
     * Click handler for the Analyze button. Analyzes every selected file, or
     * falls back to the pasted text when no file is selected.
     *
     * Reports are shown in file-selection order regardless of which read
     * finishes first, and the processing message is removed once every file
     * has been handled.
     */
    function analyze() {
      const files = [...document.getElementById('fileInput').files],
        results = document.getElementById('results');
      if (!files.length) return analyzeText();

      // One slot per file keeps the output order stable; '' means "not finished yet".
      const reports = new Array(files.length).fill('');
      let pending = files.length;
      const render = () => {
        results.textContent = (pending ? '⌛ Processing...\r\n' : '') + reports.join('');
      };
      const finish = (index, report) => {
        reports[index] = report;
        pending--;
        render();
      };

      render();
      files.forEach((file, index) => {
        const reader = new FileReader();
        reader.onload = () => {
          const res = lrs.textReport(lrs.text(reader.result, getOpts()), 0, getChars());
          finish(index, `📄 Analysis of repeated strings in "${file.name}": ${res ? res : 'No results.'} \r\n`);
        };
        // Surface read failures instead of leaving the page stuck on "Processing".
        reader.onerror = () => finish(index, `⚠️ Could not read "${file.name}". \r\n`);
        reader.readAsText(file);
      });
    }

    // Minimal CommonJS stand-ins so index.js can be loaded with a plain <script> tag.
    // NOTE(review): assumes index.js only needs `require` to be callable and
    // `module` to be an object it can assign to; confirm against index.js.
    function require() {
      return 0;
    }
    var module = {};
  </script>
  <script src="index.js"></script>
</body>
</html>