UNPKG

notion-md-crawler

Version:

A library to recursively retrieve and serialize Notion pages with customization for machine learning applications.

6 lines 9.43 kB
import {
  APIErrorCode,
  collectPaginatedAPI,
  isNotionClientError,
} from "@notionhq/client";
import * as md from "md-utils-ts";

/* -------------------------------------------------------------------------- */
/* Shared constants and tiny helpers                                          */
/* -------------------------------------------------------------------------- */

/** Placeholder emitted whenever a value is missing. */
const EMPTY = "<empty>";
/** Separator used between the items of multi-valued properties. */
const SEPARATOR = ", ";
/** Line separator used when a page is rendered to a single string. */
const NEWLINE = "\n";

/** `true` when `key` is present on `obj` (own or inherited). */
const has = (obj, key) => key in obj;

/** Resolves after `ms` milliseconds. */
const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Defines every entry of `getters` on `target` as an enumerable getter.
 * Used to build the `serializer.utils` namespace object.
 */
const defineExports = (target, getters) => {
  for (const name of Object.keys(getters)) {
    Object.defineProperty(target, name, { get: getters[name], enumerable: true });
  }
};

/* -------------------------------------------------------------------------- */
/* Notion API access                                                          */
/* -------------------------------------------------------------------------- */

/** `true` when `error` is a Notion client error carrying the rate-limit code. */
const isRateLimitError = (error) =>
  isNotionClientError(error) && error.code === APIErrorCode.RateLimited;

/**
 * Runs `task`, retrying on rate-limit errors only.
 *
 * @param {() => Promise<any>} task - operation to run.
 * @param {number} [retries=5] - remaining retries.
 * @param {number} [backoff=1000] - wait before the next attempt in ms; doubled on every retry.
 * @returns {Promise<any>} the result of `task`.
 * @throws the original error when it is not a rate-limit error or no retries are left.
 */
const executeWithRetry = async (task, retries = 5, backoff = 1000) => {
  try {
    return await task();
  } catch (error) {
    if (!isRateLimitError(error) || retries === 0) throw error;
    console.log(`Rate limited. Retries left: ${retries}. Waiting ${backoff}ms before retrying...`);
    await wait(backoff);
    return executeWithRetry(task, retries - 1, backoff * 2);
  }
};

/**
 * Fetches all children of a block. Best effort: any failure yields `[]`.
 */
const fetchNotionBlocks = (client) => async (blockId) =>
  executeWithRetry(() =>
    collectPaginatedAPI(client.blocks.children.list, { block_id: blockId }),
  ).catch(() => []);

/** Retrieves a single page; errors propagate to the caller. */
const fetchNotionPage = (client) => (pageId) =>
  executeWithRetry(() => client.pages.retrieve({ page_id: pageId }));

/**
 * Fetches the entries of a database. Best effort: any failure yields `[]`.
 *
 * All result pages are collected via `collectPaginatedAPI`; a single
 * `databases.query` call only returns the first page of results, which
 * silently truncated larger databases.
 */
const fetchNotionDatabase = (client) => (databaseId) =>
  executeWithRetry(() =>
    collectPaginatedAPI((args) => client.databases.query(args), {
      database_id: databaseId,
    }),
  ).catch(() => []);

/* -------------------------------------------------------------------------- */
/* Serializer utilities (exposed as `serializer.utils`)                       */
/* -------------------------------------------------------------------------- */

/**
 * Wraps `text` with the markdown decorations enabled in `annotations`,
 * applied in the order: code, bold, italic, strikethrough, underline.
 */
const annotate = (text, annotations) => {
  let result = text;
  if (annotations.code) result = md.inlineCode(result);
  if (annotations.bold) result = md.bold(result);
  if (annotations.italic) result = md.italic(result);
  if (annotations.strikethrough) result = md.del(result);
  if (annotations.underline) result = md.underline(result);
  return result;
};

/**
 * Converts a list of Notion rich-text items into one markdown string.
 *
 * Leading/trailing whitespace of each item is kept outside of the
 * decorations so that the decorations wrap only the trimmed text.
 *
 * @param {Array<{plain_text: string, annotations: object, href: ?string}>} richTextObjects
 * @param {string|false} [urlMask=false] - when truthy, used instead of every real link target.
 * @returns {string}
 */
const fromRichText = (richTextObjects, urlMask = false) =>
  richTextObjects
    .map(({ plain_text: plainText, annotations, href }) => {
      // Whitespace-only items are passed through untouched.
      if (plainText.match(/^\s*$/)) return plainText;

      const leadingMatch = plainText.match(/^(\s*)/);
      const trailingMatch = plainText.match(/(\s*)$/);
      const leading = leadingMatch ? leadingMatch[0] : "";
      const trailing = trailingMatch ? trailingMatch[0] : "";
      const core = plainText.trim();
      if (core === "") return leading + trailing;

      const annotated = annotate(core, annotations);
      const linked = href ? md.anchor(annotated, urlMask || href) : annotated;
      return leading + linked + trailing;
    })
    .join("");

/**
 * Extracts `{ title, href }` from a Notion file-like object
 * (`external` or hosted `file`). The title is the caption when it is not
 * blank, otherwise the file name found in the URL, otherwise `"link"`.
 */
const fromLink = (linkObject) => {
  const caption = fromRichText(linkObject.caption);
  const href =
    linkObject.type === "external" ? linkObject.external.url : linkObject.file.url;
  const fileName = href.match(/[^\/\\&\?]+\.\w{3,4}(?=([\?&].*$|$))/);
  let title = "link";
  if (caption.trim()) {
    title = caption;
  } else if (fileName) {
    title = fileName[0];
  }
  return { title, href };
};

/** Renders a user as its name; non-person users get a `[bot]` suffix. */
const fromUser = (user) => {
  if (!has(user, "type")) return EMPTY;
  const name = user.name != null ? user.name : EMPTY;
  return user.type === "person" ? `${name}` : `${name}[bot]`;
};

/** Renders a date value: the start, a start/end pair, or the empty placeholder. */
const fromDate = (date) => {
  if (!date) return EMPTY;
  return date.end ? `(start)${date.start}, (end): ${date.end}` : date.start;
};

const utils = {};
defineExports(utils, {
  annotate: () => annotate,
  fromDate: () => fromDate,
  fromLink: () => fromLink,
  fromRichText: () => fromRichText,
  fromUser: () => fromUser,
});

/* -------------------------------------------------------------------------- */
/* Block serializers                                                          */
/* Each factory takes `{ urlMask }` and returns `(block) => string | false`.  */
/* `false` means "emit nothing for this block".                               */
/* -------------------------------------------------------------------------- */

/** Factory for block types that produce no output of their own. */
const skipBlock = () => () => false;

/**
 * Factory for file-like blocks rendered as an anchor.
 * `key` is the block property holding the file object.
 *
 * The anchor expression stays on the `return` line: a line break directly
 * after `return` makes the function return `undefined`.
 */
const fileAnchor = (key) => ({ urlMask }) => (block) => {
  const { title, href } = fromLink(block[key]);
  return md.anchor(title, urlMask || href);
};

const audio = fileAnchor("audio");
const file = fileAnchor("file");
const pdf = fileAnchor("pdf");
const video = fileAnchor("video");

const bookmark = ({ urlMask }) => (block) =>
  md.anchor(fromRichText(block.bookmark.caption), urlMask || block.bookmark.url);

const bulletedListItem = ({ urlMask }) => (block) =>
  md.bullet(fromRichText(block.bulleted_list_item.rich_text, urlMask));

const callout = ({ urlMask }) => (block) =>
  md.quote(fromRichText(block.callout.rich_text, urlMask));

const childPage = () => (block) => `[${block.child_page.title}]`;

const childDatabase = () => (block) => `[${block.child_database.title}]`;

const code = ({ urlMask }) => (block) =>
  md.codeBlock(block.code.language)(fromRichText(block.code.rich_text, urlMask));

const divider = () => () => md.hr();

const embed = ({ urlMask }) => (block) => {
  const caption = fromRichText(block.embed.caption, urlMask);
  return md.anchor(caption, urlMask || block.embed.url);
};

const equation = () => (block) => md.equationBlock(block.equation.expression);

const heading1 = ({ urlMask }) => (block) =>
  md.h1(fromRichText(block.heading_1.rich_text, urlMask));

const heading2 = ({ urlMask }) => (block) =>
  md.h2(fromRichText(block.heading_2.rich_text, urlMask));

const heading3 = ({ urlMask }) => (block) =>
  md.h3(fromRichText(block.heading_3.rich_text, urlMask));

const image = ({ urlMask }) => (block) => {
  const { title, href } = fromLink(block.image);
  return md.image(title, urlMask || href);
};

const linkPreview = ({ urlMask }) => (block) =>
  md.anchor(block.type, urlMask || block.link_preview.url);

const linkToPage = ({ urlMask }) => (block) => {
  const target =
    block.link_to_page.type === "page_id" ? block.link_to_page.page_id : "";
  return md.anchor(block.type, urlMask || target);
};

const numberedListItem = ({ urlMask }) => (block) =>
  md.bullet(fromRichText(block.numbered_list_item.rich_text, urlMask), 1);

const paragraph = ({ urlMask }) => (block) =>
  fromRichText(block.paragraph.rich_text, urlMask);

const quote = ({ urlMask }) => (block) =>
  md.quote(fromRichText(block.quote.rich_text, urlMask));

/**
 * Renders one table row as `| a | b | c |`.
 *
 * Every cell (itself a list of rich-text items) becomes exactly one column.
 * Flattening the rich-text items instead dropped empty cells and split
 * multi-segment cells into several columns, misaligning the table.
 */
const tableRow = ({ urlMask }) => (block) => {
  const columns = block.table_row.cells.map((cell) => fromRichText(cell, urlMask));
  return `| ${columns.join(" | ")} |`;
};

const template = ({ urlMask }) => (block) =>
  fromRichText(block.template.rich_text, urlMask);

const toDo = ({ urlMask }) => (block) =>
  md.todo(fromRichText(block.to_do.rich_text, urlMask), block.to_do.checked);

const toggle = ({ urlMask }) => (block) =>
  fromRichText(block.toggle.rich_text, urlMask);

/** Builds the full block-type -> serializer table for the given options. */
const createBlockSerializers = (options) => ({
  audio: audio(options),
  bookmark: bookmark(options),
  breadcrumb: skipBlock(options),
  bulleted_list_item: bulletedListItem(options),
  callout: callout(options),
  child_database: childDatabase(options),
  child_page: childPage(options),
  code: code(options),
  column: skipBlock(options),
  column_list: skipBlock(options),
  divider: divider(options),
  embed: embed(options),
  equation: equation(options),
  file: file(options),
  heading_1: heading1(options),
  heading_2: heading2(options),
  heading_3: heading3(options),
  image: image(options),
  link_preview: linkPreview(options),
  link_to_page: linkToPage(options),
  numbered_list_item: numberedListItem(options),
  paragraph: paragraph(options),
  pdf: pdf(options),
  quote: quote(options),
  synced_block: skipBlock(options),
  table: skipBlock(options),
  table_of_contents: skipBlock(options),
  table_row: tableRow(options),
  template: template(options),
  to_do: toDo(options),
  toggle: toggle(options),
  unsupported: skipBlock(options),
  video: video(options),
});

const blockSerializer = {
  defaults: createBlockSerializers({ urlMask: false }),
  strategy: createBlockSerializers,
};

/* -------------------------------------------------------------------------- */
/* Property serializers                                                       */
/* Each factory takes `{ urlMask }` (currently unused by the property         */
/* serializers) and returns `(name, property) => string | false | Promise`.   */
/* -------------------------------------------------------------------------- */

const checkbox = ({ urlMask }) => (name, prop) => `[${name}] ${prop.checkbox}`;

const createdBy = ({ urlMask }) => (name, prop) =>
  `[${name}] ${fromUser(prop.created_by)}`;

const createdTime = ({ urlMask }) => (name, prop) => `[${name}] ${prop.created_time}`;

const date = ({ urlMask }) => (name, prop) => `[${name}] ${fromDate(prop.date)}`;

const email = ({ urlMask }) => (name, prop) =>
  `[${name}] ${prop.email != null ? prop.email : EMPTY}`;

const files = ({ urlMask }) => (name, prop) =>
  `[${name}] ` +
  prop.files
    .map((entry) => {
      const href = has(entry, "external") ? entry.external.url : entry.file.url;
      return md.anchor(entry.name, href);
    })
    .join(SEPARATOR);

const formula = ({ urlMask }) => (name, prop) => {
  const value = prop.formula;
  switch (value.type) {
    case "string":
      return `[${name}] ${value.string != null ? value.string : EMPTY}`;
    case "boolean":
      return `[${name}] ${value.boolean != null ? value.boolean : EMPTY}`;
    case "date":
      return `[${name}] ${fromDate(value.date)}`;
    case "number":
      return `[${name}] ${value.number}`;
    default:
      // Unknown formula result type: skip instead of leaking `undefined`.
      return false;
  }
};

const lastEditedBy = ({ urlMask }) => (name, prop) =>
  `[${name}] ${fromUser(prop.last_edited_by)}`;

const lastEditedTime = ({ urlMask }) => (name, prop) =>
  `[${name}] ${prop.last_edited_time}`;

const multiSelect = ({ urlMask }) => (name, prop) =>
  `[${name}] ` + prop.multi_select.map((option) => option.name).join(SEPARATOR);

const number = ({ urlMask }) => (name, prop) =>
  `[${name}] ${prop.number != null ? prop.number : EMPTY}`;

const people = ({ urlMask }) => (name, prop) =>
  `[${name}] ` + prop.people.map((user) => fromUser(user)).join(SEPARATOR);

const phoneNumber = ({ urlMask }) => (name, prop) =>
  `[${name}] ${prop.phone_number != null ? prop.phone_number : EMPTY}`;

const relation = ({ urlMask }) => (name, prop) =>
  `[${name}] ` + prop.relation.map((item) => `${item.id}`).join(SEPARATOR);

const richText = ({ urlMask }) => (name, prop) =>
  `[${name}] ${fromRichText(prop.rich_text)}`;

const select = ({ urlMask }) => (name, prop) => {
  const optionName = prop.select == null ? undefined : prop.select.name;
  return `[${name}] ${optionName != null ? optionName : EMPTY}`;
};

const status = ({ urlMask }) => (name, prop) => {
  const optionName = prop.status == null ? undefined : prop.status.name;
  return `[${name}] ${optionName != null ? optionName : EMPTY}`;
};

const title = ({ urlMask }) => (name, prop) => `[${name}] ${fromRichText(prop.title)}`;

const uniqueId = ({ urlMask }) => (name, prop) => {
  const prefix = prop.unique_id.prefix != null ? prop.unique_id.prefix : "";
  const sequence = prop.unique_id.number != null ? prop.unique_id.number : "";
  const id = prefix + sequence;
  return `[${name}] ${id || EMPTY}`;
};

const url = ({ urlMask }) => (name, prop) =>
  `[${name}] ${prop.url != null ? prop.url : EMPTY}`;

const verification = ({ urlMask }) => () => false;

/** Serializers for every property type except `rollup`. */
const createBasicPropertySerializers = (options) => ({
  checkbox: checkbox(options),
  created_by: createdBy(options),
  created_time: createdTime(options),
  date: date(options),
  email: email(options),
  files: files(options),
  formula: formula(options),
  last_edited_by: lastEditedBy(options),
  last_edited_time: lastEditedTime(options),
  multi_select: multiSelect(options),
  number: number(options),
  people: people(options),
  phone_number: phoneNumber(options),
  relation: relation(options),
  rich_text: richText(options),
  select: select(options),
  status: status(options),
  title: title(options),
  unique_id: uniqueId(options),
  url: url(options),
  verification: verification(options),
});

/**
 * Rollup serializer. `number` and `date` rollups reuse the matching basic
 * serializer; `array` rollups serialize each item with the basic serializers
 * and join the results under a single `[name] ` prefix.
 */
const rollup = (options) => (name, prop) => {
  switch (prop.rollup.type) {
    case "number":
      return number(options)(name, prop.rollup);
    case "date":
      return date(options)(name, prop.rollup);
    case "array": {
      const serializers = createBasicPropertySerializers(options);
      const pending = prop.rollup.array.map((item) => {
        const serialize = serializers[item.type];
        // Item types without a serializer are skipped rather than throwing.
        return typeof serialize === "function" ? serialize(name, item) : false;
      });
      return Promise.all(pending).then(
        (texts) =>
          `[${name}] ` +
          texts
            // Skipped items (`false`) have no text to strip the prefix from.
            .filter((text) => typeof text === "string")
            .map((text) => text.replace(`[${name}] `, ""))
            .join(SEPARATOR),
      );
    }
    default:
      // Unknown rollup result type: skip instead of leaking `undefined`.
      return false;
  }
};

/** Builds the full property-type -> serializer table for the given options. */
const createPropertySerializers = (options) => ({
  ...createBasicPropertySerializers(options),
  rollup: rollup(options),
});

const propertySerializer = {
  defaults: createPropertySerializers({ urlMask: false }),
  strategy: createPropertySerializers,
};

/**
 * Serializes every entry of a Notion `properties` object with `serializers`.
 * Results equal to `false` are dropped; property types without a serializer
 * are skipped instead of aborting the whole page with a `TypeError`.
 *
 * @returns {Promise<Array>} serialized property lines.
 */
const serializeProperties = (serializers) => (properties) =>
  Promise.all(
    Object.entries(properties).map(([name, property]) => {
      const serialize = serializers[property.type];
      return typeof serialize === "function" ? serialize(name, property) : false;
    }),
  ).then((texts) => texts.filter((text) => text !== false));

/** Public serializer namespace: block and property strategies plus utilities. */
const serializer = {
  block: blockSerializer,
  property: propertySerializer,
  utils,
};

/* -------------------------------------------------------------------------- */
/* Crawler                                                                    */
/* -------------------------------------------------------------------------- */

/** `true` when the block's `type` is one of `types`. */
const hasType = (block, types) => types.includes(block.type);

/** Truthy when `skipPageIds` is given and contains `id`. */
const isSkipped = (id, skipPageIds) => skipPageIds && skipPageIds.includes(id);

/** `true` for blocks that start a nested page or database. */
const isPage = (block) => hasType(block, ["child_page", "child_database"]);

/** Block types whose children are rendered at the same nesting depth. */
const FLAT_CONTAINER_TYPES = ["table", "table_row", "column_list", "column"];

/** Indenter created with the library's default settings. */
const indentLine = md.indent();

/**
 * Returns a builder that creates a page record
 * `{ metadata, properties, lines: [] }`. Fields returned by the optional
 * user `metadataBuilder` are merged over the built-in metadata.
 */
const pageBuilder = (metadataBuilder) => async (page, pageTitle, parent, properties) => {
  const base = {
    id: page.id,
    title: pageTitle,
    createdTime: page.created_time,
    lastEditedTime: page.last_edited_time,
    parentId: parent == null ? undefined : parent.metadata.id,
  };
  const extra = metadataBuilder
    ? await metadataBuilder({ page, title: pageTitle, properties, parent })
    : {};
  return { metadata: { ...base, ...extra }, properties: properties || [], lines: [] };
};

/** Block serializer table for `options`: defaults overridden by user serializers. */
const blockSerializersFor = ({ urlMask = false, serializers }) => ({
  ...serializer.block.strategy({ urlMask }),
  ...(serializers == null ? undefined : serializers.block),
});

/** Wraps a successfully crawled page in a result record. */
const success = (page) => ({ id: page.metadata.id, success: true, page });

/** Wraps a crawl error for `page` in a failure result record. */
const failure = (page, error) => ({
  id: page.metadata.id,
  success: false,
  failure: {
    parentId: page.metadata.parentId,
    reason:
      error instanceof Error
        ? `${error.name}: ${error.message} ${error.stack}`
        : `${error}`,
  },
});

/**
 * Recursively serializes `blocks` into indented markdown lines and collects
 * the nested page/database blocks found on the way.
 *
 * @param {object} options - crawler options (`client`, `urlMask`, `serializers`, ...).
 * @returns {(blocks: Array, depth?: number) => Promise<{lines: Array, pages: Array}>}
 */
const walk = (options) => async (blocks, depth = 0) => {
  // Built once per call instead of once per block.
  const serializers = blockSerializersFor(options);
  const lines = [];
  const pages = [];

  for (const block of blocks) {
    if (!has(block, "type")) continue;
    const { type } = block;

    // Block types without a serializer are skipped rather than aborting the page.
    const serialize = serializers[type];
    const text = typeof serialize === "function" ? await serialize(block) : false;
    if (text !== false) lines.push(indentLine(text, depth));

    // Nested pages are crawled later by the caller, not inlined here.
    if (isPage(block)) {
      pages.push(block);
      continue;
    }

    if (hasType(block, ["synced_block"])) {
      // A synced copy has no children of its own; read them from the original.
      const syncedFrom = block.synced_block.synced_from;
      const sourceId = (syncedFrom == null ? undefined : syncedFrom.block_id) || block.id;
      const children = await fetchNotionBlocks(options.client)(sourceId);
      const nested = await walk(options)(children, depth);
      lines.push(...nested.lines);
      pages.push(...nested.pages);
      continue;
    }

    if (block.has_children) {
      const children = await fetchNotionBlocks(options.client)(block.id);
      const childDepth = FLAT_CONTAINER_TYPES.includes(type) ? depth : depth + 1;
      const nested = await walk(options)(children, childDepth);
      lines.push(...nested.lines);
      pages.push(...nested.pages);
    }
  }

  return { lines, pages };
};

/**
 * Async generator: yields the result for `parent` (with its serialized
 * lines), then recursively the results for every nested page and database
 * that is not listed in `options.skipPageIds`. Any error is yielded as a
 * failure record for `parent` instead of being thrown.
 */
const crawlPages = (options) =>
  async function* (parent, blocks, depth = 0) {
    try {
      const { client, metadataBuilder } = options;
      const buildPage = pageBuilder(metadataBuilder);
      const { lines, pages } = await walk(options)(blocks, depth);
      yield success({ ...parent, lines });

      for (const child of pages) {
        if (isSkipped(child.id, options.skipPageIds)) continue;

        if (hasType(child, ["child_page"])) {
          const { title: childTitle } = child.child_page;
          const page = await buildPage(child, childTitle, parent);
          const childBlocks = await fetchNotionBlocks(client)(child.id);
          yield* crawlPages(options)(page, childBlocks, 0);
          continue;
        }

        if (hasType(child, ["child_database"])) {
          const { title: childTitle } = child.child_database;
          const database = await buildPage(child, childTitle, parent);
          yield* dbCrawler({ ...options, parent: database })(child.id);
        }
      }
    } catch (error) {
      yield failure(parent, error);
    }
  };

/** Serializes a page's properties using defaults overridden by user serializers. */
const serializePageProperties = (properties, options) => {
  const { urlMask = false, serializers } = options;
  const table = {
    ...serializer.property.strategy({ urlMask }),
    ...(serializers == null ? undefined : serializers.property),
  };
  return serializeProperties(table)(properties);
};

/** Returns the text of the page's `title` property, or `""` when there is none. */
const extractPageTitle = (page) => {
  if (!has(page, "properties")) return "";
  let pageTitle = "";
  for (const property of Object.values(page.properties)) {
    if (property.type !== "title") continue;
    pageTitle = serializer.property.defaults.title("", property).replace("[] ", "");
  }
  return pageTitle;
};

/**
 * Creates a page crawler.
 *
 * The returned async generator function takes a root page id and yields one
 * result record per crawled page (`{ id, success: true, page }` or
 * `{ id, success: false, failure }`). When anything fails before or during
 * the page crawl, the id is retried as a database via `dbCrawler`.
 *
 * @param {object} options - `client`, and optionally `parent`,
 *   `metadataBuilder`, `skipPageIds`, `urlMask`, `serializers`.
 */
const crawler = (options) =>
  async function* (rootPageId) {
    const { client, parent, metadataBuilder, skipPageIds } = options;
    if (isSkipped(rootPageId, skipPageIds)) return;

    try {
      const notionPage = await fetchNotionPage(client)(rootPageId);
      if (!has(notionPage, "parent")) {
        // `return yield` keeps the generator's completion value unchanged.
        return yield {
          id: rootPageId,
          success: false,
          failure: {
            parentId: parent == null ? undefined : parent.metadata.id,
            reason: "Unintended Notion Page object.",
          },
        };
      }

      const properties = await serializePageProperties(notionPage.properties, options);
      const blocks = await fetchNotionBlocks(client)(notionPage.id);
      const pageTitle = extractPageTitle(notionPage);
      const rootPage = await pageBuilder(metadataBuilder)(
        notionPage,
        pageTitle,
        parent,
        properties,
      );
      yield* crawlPages(options)(rootPage, blocks);
    } catch {
      // Deliberate fallback: the id may refer to a database rather than a page.
      yield* dbCrawler(options)(rootPageId);
    }
  };

/**
 * Creates a database crawler.
 *
 * The returned async generator function takes a database id, yields the
 * `options.parent` record first (when present) and then the results of
 * crawling every entry of the database as a page.
 *
 * @param {object} options - same shape as for `crawler`.
 */
const dbCrawler = (options) =>
  async function* (rootDatabaseId) {
    const { skipPageIds } = options;
    if (isSkipped(rootDatabaseId, skipPageIds)) return;

    const crawl = crawler(options);
    const records = await fetchNotionDatabase(options.client)(rootDatabaseId);
    const { parent } = options;
    if (parent) yield success(parent);

    for (const record of records) {
      yield* crawl(record.id);
    }
  };

/* -------------------------------------------------------------------------- */
/* Result helpers                                                             */
/* -------------------------------------------------------------------------- */

/** Pushes body headings one level down so the page title stays the only H1. */
const nestHeading = (line) => (line.match(/^#+\s/) ? "#" + line : line);

/**
 * Renders a crawled page as one markdown document: H1 title, a `---`
 * delimited property block, then the body lines. Everything is joined with
 * newlines; joining with spaces collapsed the document onto a single line.
 *
 * @param {{metadata: {title: string}, properties: string[], lines: string[]}} page
 * @returns {string}
 */
const pageToString = ({ metadata, properties, lines }) => {
  const heading = md.h1(metadata.title);
  const frontMatter = ["---", properties.join(NEWLINE), "---"].join(NEWLINE);
  const body = lines.map(nestHeading);
  return [heading, frontMatter, ...body].join(NEWLINE);
};

/** Drains an async iterable into an array. */
const waitAllResults = async (results) => {
  const collected = [];
  for await (const result of results) collected.push(result);
  return collected;
};

export { crawler, dbCrawler, pageToString, serializer, waitAllResults };
// NOTE(review): the source map below was generated for the minified bundle and
// will not line up with this reformatted code - regenerate or drop it.
//# sourceMappingURL=index.js.map