@flatfile/plugin-xlsx-extractor
Version:
A plugin for parsing xlsx files in Flatfile.
8 lines (6 loc) • 5.97 kB
JavaScript
import { Extractor } from '@flatfile/util-extractor';
import * as i from 'xlsx';
/** Merge directions, in the order each treatment pass visits them. */
const MERGE_DIRECTIONS = ["acrossRanges", "acrossRows", "acrossColumns"];

/**
 * Treatment passes, in execution order: fills first, then concatenation,
 * then coalescing.
 */
const MERGE_PASSES = [["applyToAll", "applyToTopLeft"], ["concatenate"], ["coalesce"]];

/**
 * Classifies a merged range by the direction it spans.
 *
 * @param {{s: {r: number, c: number}, e: {r: number, c: number}}} range - Merge range (start/end cell).
 * @returns {"acrossColumns"|"acrossRows"|"acrossRanges"} "acrossColumns" for a single row spanning
 *   several columns, "acrossRows" for a single column spanning several rows, otherwise "acrossRanges".
 */
function classifyMerge(range) {
  const sameRow = range.s.r === range.e.r;
  const sameColumn = range.s.c === range.e.c;
  if (sameRow && !sameColumn) return "acrossColumns";
  if (!sameRow && sameColumn) return "acrossRows";
  return "acrossRanges";
}

/**
 * Returns a copy of `sheet` with the configured merged-cell treatments applied
 * and its `!merges` list emptied. The input sheet is returned untouched when it
 * has no merges or no `mergedCellOptions` are configured.
 *
 * NOTE(review): cells are looked up by A1-style keys directly on the sheet
 * object, while the workbook is read with `dense: true` below — confirm the
 * installed xlsx version exposes cells this way in dense mode.
 *
 * @param {object} sheet - Worksheet object.
 * @param {object} [options] - Parser options; only `mergedCellOptions` is read.
 * @returns {object} The original sheet, or a processed clone.
 */
function processMergedCells(sheet, options) {
  if (!sheet["!merges"] || !options?.mergedCellOptions) return sheet;

  // JSON round-trip clone: values JSON cannot represent (e.g. Date objects)
  // are converted by the round-trip.
  const clone = JSON.parse(JSON.stringify(sheet));
  const merges = [...sheet["!merges"]];

  for (const treatments of MERGE_PASSES) {
    for (const direction of MERGE_DIRECTIONS) {
      applyMergeTreatment(clone, merges, options, direction, treatments);
    }
  }

  clone["!merges"] = [];
  return clone;
}

/**
 * Applies the treatment configured for `direction` to every merge of that
 * direction, provided the treatment is one of `allowedTreatments`.
 * Merges whose top-left cell is missing are skipped.
 *
 * @param {object} sheet - Worksheet clone, mutated in place.
 * @param {Array<object>} merges - Merge ranges.
 * @param {object} options - Parser options carrying `mergedCellOptions`.
 * @param {string} direction - One of MERGE_DIRECTIONS.
 * @param {string[]} allowedTreatments - Treatments handled by the current pass.
 */
function applyMergeTreatment(sheet, merges, options, direction, allowedTreatments) {
  const directionOptions = options.mergedCellOptions?.[direction];
  if (!directionOptions || !allowedTreatments.includes(directionOptions.treatment)) return;

  const { treatment } = directionOptions;
  // coalesce / concatenate are only defined for single-row or single-column merges.
  const isLinear = direction === "acrossRows" || direction === "acrossColumns";

  for (const range of merges) {
    if (classifyMerge(range) !== direction) continue;

    const topLeft = sheet[i.utils.encode_cell({ r: range.s.r, c: range.s.c })];
    if (!topLeft) continue;

    if (treatment === "applyToAll") {
      fillRange(sheet, range, topLeft);
    } else if (treatment === "applyToTopLeft") {
      blankAllButTopLeft(sheet, range);
    } else if (treatment === "coalesce" && isLinear) {
      coalesceRange(sheet, range, direction);
    } else if (treatment === "concatenate" && isLinear) {
      // `??` so that an explicit empty-string separator is honoured.
      concatenateRange(sheet, range, direction, directionOptions.separator ?? ",");
    }
  }
}

/** Writes a shallow copy of `cell` into every address of `range`. */
function fillRange(sheet, range, cell) {
  for (let row = range.s.r; row <= range.e.r; row++) {
    for (let col = range.s.c; col <= range.e.c; col++) {
      sheet[i.utils.encode_cell({ r: row, c: col })] = { ...cell };
    }
  }
}

/** Replaces every cell of `range` except the top-left one with an empty string cell. */
function blankAllButTopLeft(sheet, range) {
  for (let row = range.s.r; row <= range.e.r; row++) {
    for (let col = range.s.c; col <= range.e.c; col++) {
      if (row === range.s.r && col === range.s.c) continue;
      sheet[i.utils.encode_cell({ r: row, c: col })] = { t: "s", v: "" };
    }
  }
}

/**
 * Deletes every cell of `range` after the first row ("acrossRows") or after
 * the first column ("acrossColumns"). Other directions are left untouched.
 */
function coalesceRange(sheet, range, direction) {
  if (direction !== "acrossRows" && direction !== "acrossColumns") return;

  const firstRow = direction === "acrossRows" ? range.s.r + 1 : range.s.r;
  const firstCol = direction === "acrossColumns" ? range.s.c + 1 : range.s.c;
  for (let row = firstRow; row <= range.e.r; row++) {
    for (let col = firstCol; col <= range.e.c; col++) {
      delete sheet[i.utils.encode_cell({ r: row, c: col })];
    }
  }
}

/**
 * Joins the non-empty values of each line of `range` with `separator`, stores
 * the text in the first cell of the line and deletes the remaining cells.
 * A "line" is a column for "acrossRows" and a row for "acrossColumns".
 * Lines without any non-empty value are left as they are.
 */
function concatenateRange(sheet, range, direction, separator) {
  const byRows = direction === "acrossRows";
  if (!byRows && direction !== "acrossColumns") return;

  const [lineStart, lineEnd] = byRows ? [range.s.c, range.e.c] : [range.s.r, range.e.r];
  const [stepStart, stepEnd] = byRows ? [range.s.r, range.e.r] : [range.s.c, range.e.c];
  const addressOf = (line, step) =>
    i.utils.encode_cell(byRows ? { r: step, c: line } : { r: line, c: step });

  for (let line = lineStart; line <= lineEnd; line++) {
    const parts = [];
    for (let step = stepStart; step <= stepEnd; step++) {
      const cell = sheet[addressOf(line, step)];
      if (cell && cell.v != null && cell.v !== "") parts.push(String(cell.v));
    }
    if (parts.length === 0) continue;

    const text = parts.join(separator);
    sheet[addressOf(line, stepStart)] = { t: "s", v: text, w: text, h: text, r: `<t>${text}</t>` };
    for (let step = stepStart + 1; step <= stepEnd; step++) {
      delete sheet[addressOf(line, step)];
    }
  }
}

/**
 * Produces unique header names. Falsy headers become "empty", the first "*"
 * is removed, and repeated names get a numeric `_n` suffix. Unlike a plain
 * per-name counter, a generated name is never allowed to collide with a name
 * that is already in use (e.g. ["a", "a", "a_1"] -> ["a", "a_1", "a_1_1"]).
 *
 * @param {Array<string|null|undefined>} headers - Raw header cells.
 * @returns {string[]} Unique header names, in the same order.
 */
function dedupeHeaders(headers) {
  const nextSuffix = new Map();
  const taken = new Set();

  return headers.map((header) => {
    const base = (header || "empty").replace("*", "");
    let suffix = nextSuffix.get(base) || 0;
    let candidate = suffix ? `${base}_${suffix}` : base;
    while (taken.has(candidate)) {
      suffix += 1;
      candidate = `${base}_${suffix}`;
    }
    nextSuffix.set(base, suffix + 1);
    taken.add(candidate);
    return candidate;
  });
}

/** True for `null` and for strings containing only whitespace. */
const isBlank = (value) => value === null || (typeof value === "string" && value.trim() === "");

/**
 * Drops the blank cells after the last non-blank cell. When every cell is
 * blank the first cell is still kept.
 */
const trimTrailingEmpty = (cells) => {
  let lastFilled = 0;
  for (let idx = 0; idx < cells.length; idx++) {
    if (!isBlank(cells[idx])) lastFilled = idx;
  }
  return cells.slice(0, lastFilled + 1);
};

/**
 * Fills blank cells with the closest non-blank value above them in the same
 * column. A fully blank row resets the carried value. The row arrays are
 * modified in place; the returned outer array is a new array.
 *
 * @param {Array<Array<*>>} rows - Data rows.
 * @returns {Array<Array<*>>} Rows with values cascaded downwards.
 */
const cascadeRowValues = (rows) => {
  if (!rows || rows.length === 0) return rows;

  const result = [...rows];
  // A fold instead of Math.max(...lengths): spreading one argument per row can
  // exceed the engine's argument limit on large sheets.
  const width = rows.reduce((max, row) => Math.max(max, row.length), -Infinity);

  for (let col = 0; col < width; col++) {
    let carried = null;
    for (const row of result) {
      if (row.every(isBlank)) {
        carried = null;
        continue;
      }
      if (!isBlank(row[col])) {
        carried = row[col];
      } else if (carried !== null) {
        row[col] = carried;
      }
    }
  }
  return result;
};

/**
 * Heuristic: a row looks like a header when it has at least two non-blank
 * cells and at least 80% of them are non-numeric text.
 */
const isHeaderCandidate = (row) => {
  if (!row || row.length === 0) return false;

  const filled = row.filter((cell) => !isBlank(cell)).length;
  if (filled < 2) return false;

  const textual = row.filter(
    (cell) => typeof cell === "string" && cell.trim() !== "" && Number.isNaN(Number(cell))
  ).length;
  return textual / filled >= 0.8;
};

/**
 * Within each header-candidate row, fills blank cells with the closest
 * non-blank value to their left. The row arrays are modified in place. When no
 * row is a header candidate the input array itself is returned.
 *
 * Values cascade across every blank cell to the right, including columns that
 * are blank in all rows.
 * NOTE(review): confirm whether cascading should stop at fully blank columns.
 *
 * @param {Array<Array<*>>} rows - Rows searched for headers.
 * @returns {Array<Array<*>>} Rows with header values cascaded rightwards.
 */
const cascadeHeaderValues = (rows) => {
  if (!rows || rows.length === 0) return rows;

  const candidates = rows.filter((row) => isHeaderCandidate(row));
  if (candidates.length === 0) return rows;

  const result = [...rows];
  for (const row of candidates) {
    let lastValue = null;
    for (let col = 0; col < row.length; col++) {
      if (!isBlank(row[col])) {
        lastValue = row[col];
      } else if (lastValue !== null) {
        row[col] = lastValue;
      }
    }
  }
  return result;
};

/**
 * Parses a workbook buffer into a map of sheet name -> parsed sheet.
 * Sheets for which no data or headers are found are omitted.
 *
 * @param {*} buffer - Workbook contents, passed to `xlsx.read` with `type: "buffer"`.
 * @param {object} [options] - Parser options (dateNF, debug, mergedCellOptions, raw, rawNumbers,
 *   headerDetectionOptions, headerSelectionEnabled, skipEmptyLines, cascadeRowValues,
 *   cascadeHeaderValues, rowsToSearch, getHeaders).
 * @returns {Promise<Object<string, {headers: string[], data: object[], metadata: (object|undefined)}>>}
 * @throws {Error} "plugins.extraction.fileTooLarge" when reading fails with code ERR_STRING_TOO_LONG.
 */
async function parseBuffer(buffer, options) {
  const readOptions = {
    type: "buffer",
    cellDates: true,
    dense: true,
    dateNF: options?.dateNF || undefined,
  };

  let workbook;
  try {
    // First attempt with WTF enabled so that read errors are thrown and the
    // "string too long" case can be detected.
    workbook = i.read(buffer, { ...readOptions, WTF: true });
  } catch (error) {
    if (error?.code === "ERR_STRING_TOO_LONG") {
      if (options?.debug) {
        console.log("File is too large to parse. Try converting this file to CSV.");
      }
      throw new Error("plugins.extraction.fileTooLarge", { cause: error });
    }
    // Any other failure: retry without WTF.
    workbook = i.read(buffer, readOptions);
  }

  if (options?.mergedCellOptions) {
    for (const name of workbook.SheetNames) {
      workbook.Sheets[name] = processMergedCells(workbook.Sheets[name], options);
    }
  }

  const parsedSheets = await Promise.all(
    workbook.SheetNames.map(async (sheetName) => {
      const parsed = await parseSheet({
        sheet: workbook.Sheets[sheetName],
        sheetName,
        rawNumbers: options?.rawNumbers ?? false,
        raw: options?.raw ?? false,
        headerDetectionOptions: options?.headerDetectionOptions ?? { algorithm: "default" },
        headerSelectionEnabled: options?.headerSelectionEnabled ?? false,
        skipEmptyLines: options?.skipEmptyLines ?? false,
        debug: options?.debug,
        cascadeRowValues: options?.cascadeRowValues,
        rowsToSearch: options?.rowsToSearch ?? 20,
        cascadeHeaderValues: options?.cascadeHeaderValues,
        getHeaders: options?.getHeaders,
      });
      return [sheetName, parsed];
    })
  );

  const result = {};
  for (const [sheetName, parsed] of parsedSheets) {
    if (parsed) result[sheetName] = parsed;
  }
  return result;
}

/**
 * Converts one worksheet into `{ headers, data, metadata }`.
 *
 * @param {object} params
 * @param {object} params.sheet - Worksheet object.
 * @param {string} params.sheetName - Sheet name, used in debug output only.
 * @param {boolean} params.rawNumbers - Forwarded to `sheet_to_json`.
 * @param {boolean} params.raw - Forwarded to `sheet_to_json`.
 * @param {object} params.headerDetectionOptions - First argument passed to `getHeaders`.
 * @param {boolean} params.headerSelectionEnabled - When true, all rows are kept as data, column
 *   keys are used as headers and the detected header row is reported in `metadata.rowHeaders`.
 * @param {boolean} params.skipEmptyLines - When true (and header selection is off) blank rows are dropped.
 * @param {boolean} [params.debug] - Enables console logging.
 * @param {boolean} [params.cascadeRowValues] - Cascade values down the data rows.
 * @param {boolean} [params.cascadeHeaderValues] - Cascade values across header-candidate rows.
 * @param {number} params.rowsToSearch - Number of leading rows handed to `getHeaders`.
 * @param {Function} params.getHeaders - Called as `getHeaders(headerDetectionOptions, rows)`; must
 *   resolve to `{ headerRow, header }`.
 * @returns {Promise<{headers: string[], data: object[], metadata: (object|undefined)}|undefined>}
 *   `undefined` when the sheet yields no rows or no headers.
 */
async function parseSheet({
  sheet,
  sheetName,
  rawNumbers,
  raw,
  headerDetectionOptions,
  headerSelectionEnabled,
  skipEmptyLines,
  debug,
  cascadeRowValues: shouldCascadeRows,
  cascadeHeaderValues: shouldCascadeHeaders,
  rowsToSearch,
  getHeaders,
}) {
  const records = i.utils.sheet_to_json(sheet, {
    header: "A",
    defval: null,
    rawNumbers,
    raw,
    blankrows: headerSelectionEnabled || !skipEmptyLines,
  });
  if (records.length === 0) return;

  const columnKeys = Object.keys(records[0]);
  const rows = records.map((record) => Object.values(record));

  // Drop trailing rows that contain only blank cells.
  while (rows.length > 0 && rows[rows.length - 1].every(isBlank)) {
    rows.pop();
  }
  if (rows.length === 0) {
    if (debug) console.log(`No data rows found in '${sheetName}'`);
    return;
  }

  let searchRows = rows.slice(0, rowsToSearch);
  if (shouldCascadeHeaders) {
    searchRows = cascadeHeaderValues(searchRows);
    if (debug) console.log(`Applied cascadeHeaderValues to '${sheetName}'`);
  }

  const searchText = searchRows.map((row) => row.map((cell) => (cell === null ? "" : String(cell))));
  const { headerRow, header } = await getHeaders(headerDetectionOptions, searchText);
  if (debug) {
    console.log(`@debug Detected ${headerRow} rows to skip`);
    console.log("@debug Detected header row:", header);
  }

  const cleanedHeader = trimTrailingEmpty(header);
  if (debug) console.log("@debug Cleaned detected header:", cleanedHeader);

  let dataRows = headerSelectionEnabled ? rows : rows.slice(headerRow);
  if (dataRows.length === 0) {
    if (debug) console.log(`@debug No data rows remaining after header processing in '${sheetName}'`);
    return;
  }

  if (shouldCascadeRows) {
    dataRows = cascadeRowValues(dataRows);
    if (debug) console.log(`@debug Applied cascadeRowValues to '${sheetName}'`);
  }

  const rawHeaders = headerSelectionEnabled
    ? columnKeys.slice(0, cleanedHeader.length)
    : cleanedHeader;
  const headers = dedupeHeaders(rawHeaders);
  if (debug) console.log("@debug uniqueHeaders", headers);
  if (headers.length === 0) {
    if (debug) console.log(`@debug No headers found in '${sheetName}'`);
    return;
  }

  const data = dataRows.map((row) => {
    const record = {};
    row.forEach((value, idx) => {
      const key = headers[idx];
      // Cells beyond the header width, or under an empty header name, are dropped.
      if (key) record[key] = { value };
    });
    return record;
  });

  const metadata = headerSelectionEnabled ? { rowHeaders: [headerRow] } : undefined;
  return { headers, data, metadata };
}

/**
 * Builds the extractor for Excel-family files (.xls, .xlsx, .xlsm, .xlsb,
 * .xlt, .xltx, .xltm) by handing the parser and `options` to `Extractor`.
 */
const K = (options) => Extractor(/\.(xlsx?|xlsm|xlsb|xltx?|xltm)$/i, "excel", parseBuffer, options);
const ce = parseBuffer;
const ie = K;
export { K as ExcelExtractor, ce as excelParser, ie as xlsxExtractorPlugin };
//# sourceMappingURL=index.js.map
//# sourceMappingURL=index.js.map