UNPKG

@forzalabs/remora

Version:

A powerful CLI tool for seamless data translation.

187 lines (186 loc) 10.8 kB
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../core/Affirm"));
const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
const DriverHelper_1 = __importDefault(require("./DriverHelper"));
/**
 * Delta Share (Databricks Delta Sharing) Source Driver.
 *
 * Talks to a Delta Sharing REST endpoint with a bearer token, lists the parquet
 * files that make up one shared table and reads them with `hyparquet`.
 * All public members are instance-bound arrow functions, so they can be passed
 * around without losing `this`.
 */
class DeltaShareSourceDriver {
    constructor() {
        // REST endpoint templates; placeholders are filled in by `_buildUrl`.
        this._query = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/query';
        this._version = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/version';
        this._tablesInShare = '{prefix}/shares/{share}/all-tables';
        this._tablesInSchema = '{prefix}/shares/{share}/schemas/{schema}/tables';
        this._schemasInShare = '{prefix}/shares/{share}/schemas';
        this._shares = '{prefix}/shares';
        /**
         * Validates the source configuration and stores the connection settings.
         *
         * @param source Source definition; `source.authentication` must carry `host`,
         *   a token (`bearerToken`, `sessionToken` or `password`), `share`, `schema` and `table`.
         * @returns {Promise<DeltaShareSourceDriver>} this driver, ready to be used.
         * @throws whatever `Affirm` throws when a required setting is missing.
         */
        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(source, 'Invalid source');
            // Expected authentication shape for delta-share
            const { authentication } = source;
            (0, Affirm_1.default)(authentication, 'Invalid authentication for delta-share source');
            this._shareUrl = authentication.host;
            this._bearerToken = SecretManager_1.default.replaceSecret(authentication.bearerToken || authentication.sessionToken || authentication.password);
            this._share = authentication.share;
            this._schema = authentication.schema;
            this._table = authentication.table;
            (0, Affirm_1.default)(this._shareUrl, 'Missing delta-share host (share server URL) in source.authentication.host');
            // The message now names every field the token is actually read from.
            (0, Affirm_1.default)(this._bearerToken, 'Missing delta-share bearer token in source.authentication.bearerToken (or sessionToken / password)');
            // NOTE(review): the next two messages mention "bucket" and "database", but this
            // method only reads authentication.share / authentication.table — confirm whether
            // a caller maps those fields before init, otherwise the hints are misleading.
            (0, Affirm_1.default)(this._share, 'Missing delta-share "share" (use authentication.share or bucket)');
            (0, Affirm_1.default)(this._schema, 'Missing delta-share schema in source.authentication.schema');
            (0, Affirm_1.default)(this._table, 'Missing delta-share table in source.authentication.table (or database)');
            this._source = source;
            return this;
        });
        // Delta Sharing is not a SQL engine; expose explicit error
        this.execute = (__sql) => __awaiter(this, void 0, void 0, function* () {
            void __sql;
            throw new Error('DeltaShareSourceDriver.execute is not supported: Delta Sharing is not a SQL engine');
        });
        this.query = (__sql, __values) => __awaiter(this, void 0, void 0, function* () {
            void __sql;
            void __values;
            throw new Error('DeltaShareSourceDriver.query is not supported: Delta Sharing is not a SQL engine');
        });
        /**
         * Reads every record of the configured table.
         *
         * @param request Read request; a string `fileKey` must not contain "%".
         * @returns {Promise<string[]>} one JSON string per record, in file order.
         */
        this.readAll = (request) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(request, `Invalid download request`);
            // Only inspect the key when it is a string: it is not used for anything else here,
            // so a missing key should not surface as a TypeError.
            const fileKey = request.fileKey;
            (0, Affirm_1.default)(typeof fileKey !== 'string' || !fileKey.includes('%'), `On a delta-share the file key can not include "%"`);
            const deltaFiles = yield this._getAllFilesInTables(this._table);
            const hyparquet = yield import('hyparquet');
            const lines = [];
            for (const deltaFile of deltaFiles) {
                const parquetRecords = yield this._readParquetRecords(deltaFile, hyparquet);
                // Push one by one: spreading a whole file into push(...) passes every row as a
                // call argument and throws a RangeError on large parquet files.
                for (const record of parquetRecords) {
                    lines.push(JSON.stringify(record));
                }
            }
            return lines;
        });
        /**
         * Reads the records whose zero-based position (across all files, in file order)
         * falls in `[lineFrom, lineTo)`.
         *
         * @param request Read request with `options.lineFrom` and `options.lineTo`.
         * @returns {Promise<string[]>} one JSON string per record in the range.
         */
        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(request, 'Invalid read request');
            (0, Affirm_1.default)(request.options, 'Invalid read options');
            (0, Affirm_1.default)(request.options.lineFrom !== undefined && request.options.lineTo !== undefined, 'Missing read range');
            const deltaFiles = yield this._getAllFilesInTables(this._table);
            const { options: { lineFrom, lineTo } } = request;
            const hyparquet = yield import('hyparquet');
            const lines = [];
            let index = 0;
            for (const deltaFile of deltaFiles) {
                // Range already satisfied: do not download and parse the remaining files.
                if (index >= lineTo)
                    break;
                const parquetRecords = yield this._readParquetRecords(deltaFile, hyparquet);
                for (const record of parquetRecords) {
                    if (index >= lineTo)
                        break;
                    if (index >= lineFrom)
                        lines.push(JSON.stringify(record));
                    index++;
                }
            }
            return lines;
        });
        /**
         * Downloads the whole table into the dataset's local unified file.
         *
         * @param dataset Target dataset; its delimiter and path drive the local file, and
         *   its first line and count are updated from the downloaded records.
         * @returns {Promise<*>} the same dataset instance.
         */
        this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(dataset, 'Invalid dataset');
            const deltaFiles = yield this._getAllFilesInTables(this._table);
            const hyparquet = yield import('hyparquet');
            // For each file, download it with the hyparquet package, read lines, then save locally to create the dataset
            let fileIndex = 0;
            let totalLineCount = 0;
            let firstLineSet = false;
            for (const deltaFile of deltaFiles) {
                const parquetRecords = yield this._readParquetRecords(deltaFile, hyparquet);
                if (!firstLineSet && parquetRecords.length > 0) {
                    // The first record is intentionally kept as JSON, so it can be used to extract the dimensions.
                    // A flag (not the file index) is used so an empty first file does not leave it unset.
                    dataset.setFirstLine(JSON.stringify(parquetRecords[0]));
                    firstLineSet = true;
                }
                const appended = yield DriverHelper_1.default.appendObjectsToUnifiedFile({
                    append: fileIndex > 0,
                    delimiter: dataset.getDelimiter(),
                    destinationPath: dataset.getPath(),
                    objects: parquetRecords
                });
                totalLineCount += appended;
                fileIndex++;
            }
            dataset.setCount(totalLineCount);
            return dataset;
        });
        /**
         * Best-effort existence check: true when the table's files can be listed.
         * Any failure (network, auth, missing table) is reported as `false`.
         */
        this.exist = (__producer) => __awaiter(this, void 0, void 0, function* () {
            void __producer;
            try {
                yield this._getAllFilesInTables(this._table);
                // If it doesn't exist, then it fails in the above function
                return true;
            }
            catch (_a) {
                return false;
            }
        });
        /**
         * Fills an endpoint template with the configured host/share/schema and the given table.
         *
         * A function replacer is used so "$" sequences in a value are inserted literally, and
         * trailing slashes on the host are dropped so "https://host/api/" does not yield "//shares".
         *
         * @param {string} template One of the endpoint templates defined above.
         * @param {string} table Table name to substitute for `{table}`.
         * @returns {string} the resolved URL.
         */
        this._buildUrl = (template, table) => {
            const values = {
                prefix: String(this._shareUrl).replace(/\/+$/, ''),
                share: this._share,
                schema: this._schema,
                table: table
            };
            return template.replace(/\{(prefix|share|schema|table)\}/g, (_match, key) => String(values[key]));
        };
        /**
         * Downloads one file entry of a query response and decodes it into row objects.
         *
         * @param deltaFile One parsed line of the query response (`{ file: { url, deltaSingleAction } }`).
         * @param hyparquet The dynamically imported `hyparquet` module.
         * @returns {Promise<object[]>} the rows of the parquet file.
         */
        this._readParquetRecords = (deltaFile, hyparquet) => __awaiter(this, void 0, void 0, function* () {
            const { asyncBufferFromUrl, parquetReadObjects } = hyparquet;
            const action = deltaFile.file.deltaSingleAction;
            // File size comes from the "add" action, falling back to the "remove" action.
            let byteLength = (action.add === null || action.add === undefined) ? undefined : action.add.size;
            if (byteLength === null || byteLength === undefined) {
                byteLength = (action.remove === null || action.remove === undefined) ? undefined : action.remove.size;
            }
            const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
            return yield parquetReadObjects({ file });
        });
        /**
         * Calls the table "version" endpoint and fails (via Affirm) when it does not answer OK.
         *
         * @param {string} table Table name.
         * @returns the value read from the response headers (see the note below).
         */
        this._getVersion = (table) => __awaiter(this, void 0, void 0, function* () {
            const url = this._buildUrl(this._version, table);
            const res = yield fetch(url, {
                method: 'GET',
                headers: {
                    Authorization: `Bearer ${this._bearerToken}`
                }
            });
            // The body is read unconditionally (as before) so the response is always consumed.
            (0, Affirm_1.default)(res.ok, `Error fetching version from the delta share: ${res.status} ${res.statusText} (${yield res.text()})`);
            // NOTE(review): `res.headers` is a fetch `Headers` object, so this property access
            // yields undefined and the query below is effectively sent without a version.
            // The straightforward fix is `Number(res.headers.get('delta-table-version'))`, but that
            // would start pinning the version in the query, which the Delta Sharing protocol
            // documents as supported only on tables shared with history — presumably breaking
            // reads of other tables. Confirm against the target servers before changing.
            const version = res.headers['delta-table-version'];
            return version;
        });
        /**
         * Queries the table and returns the file entries of the response.
         *
         * @param {string} table Table name.
         * @returns {Promise<object[]>} parsed response lines from the third one onwards.
         * @throws whatever `Affirm` throws when the server does not answer OK.
         */
        this._getAllFilesInTables = (table) => __awaiter(this, void 0, void 0, function* () {
            const url = this._buildUrl(this._query, table);
            const body = {
                version: yield this._getVersion(table)
            };
            const res = yield fetch(url, {
                method: 'POST',
                headers: {
                    'Authorization': `Bearer ${this._bearerToken}`,
                    // The body is JSON; without this header fetch labels a string body as text/plain.
                    'Content-Type': 'application/json; charset=utf-8',
                    'delta-sharing-capabilities': 'responseformat=delta;readerfeatures=deletionvectors'
                },
                body: JSON.stringify(body)
            });
            const rawText = yield res.text();
            (0, Affirm_1.default)(res.ok, `Error fetching data from the delta share: ${res.status} ${res.statusText}; Message: ${rawText}`);
            // By the protocol: the first line is the protocol, the second is the metadata, I'm interested from the third onwards
            const deltaLines = rawText
                .split('\n')
                // trim() so a stray "\r"-only line is treated as blank instead of reaching JSON.parse
                .filter(x => x.trim().length > 0)
                .slice(2)
                .map(x => JSON.parse(x));
            return deltaLines;
        });
        this.ready = (request) => __awaiter(this, void 0, void 0, function* () {
            void request;
            throw new Error('DeltaShareSourceDriver.ready is not supported: Delta Sharing does not support readiness checks');
        });
    }
}
exports.default = DeltaShareSourceDriver;