@forzalabs/remora
Version:
A powerful CLI tool for seamless data translation.
187 lines (186 loc) • 10.8 kB
JavaScript
"use strict";
/**
 * Runtime helper that drives a generator function as if it were an async
 * function: every yielded value is awaited and its settled result is fed
 * back into the generator.
 *
 * @param {*} thisArg - `this` binding for the generator function.
 * @param {Array|undefined} _arguments - Arguments for the generator function (defaults to none).
 * @param {Function|undefined} P - Promise constructor to use (defaults to the global `Promise`).
 * @param {Function} generator - Generator function holding the async body.
 * @returns {Promise} Resolves with the generator's return value, rejects with anything it throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values so that every yielded value can be chained with `.then`.
    const toPromise = (value) => (value instanceof P ? value : new P((resolve) => { resolve(value); }));
    return new (P || (P = Promise))((resolve, reject) => {
        // Settle when the generator is finished, otherwise wait for the yielded value.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        };
        // Resume the generator with the awaited value.
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        // Re-throw the rejection inside the generator so its try/catch can see it.
        const onRejected = (value) => {
            try {
                advance(generator["throw"](value));
            }
            catch (e) {
                reject(e);
            }
        };
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
/**
 * Runtime helper for default imports of CommonJS modules.
 *
 * @param {*} mod - The value returned by `require`.
 * @returns {*} `mod` itself when it is flagged `__esModule`, otherwise a
 *   wrapper object exposing `mod` under the `default` key.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const Affirm_1 = __importDefault(require("../core/Affirm"));
const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
const DriverHelper_1 = __importDefault(require("./DriverHelper"));
/**
 * Delta Share (Databricks Delta Sharing) Source Driver.
 *
 * Talks to a Delta Sharing REST endpoint with a bearer token, asks it for the
 * parquet files backing one shared table and reads those files with the
 * `hyparquet` package. Records are handed back as JSON strings.
 */
class DeltaShareSourceDriver {
    constructor() {
        // REST path templates; the placeholders are filled in by `_buildUrl`.
        this._query = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/query';
        this._version = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/version';
        this._tablesInShare = '{prefix}/shares/{share}/all-tables';
        this._tablesInSchema = '{prefix}/shares/{share}/schemas/{schema}/tables';
        this._schemasInShare = '{prefix}/shares/{share}/schemas';
        this._shares = '{prefix}/shares';
        /**
         * Validates the source configuration and stores the connection settings.
         *
         * Reads `host`, `share`, `schema`, `table` and the first non-empty of
         * `bearerToken` / `sessionToken` / `password` from `source.authentication`.
         * The token goes through `SecretManager.replaceSecret` first (presumably
         * resolving secret placeholders; see that module).
         *
         * @param source - Source definition carrying an `authentication` object.
         * @returns This driver instance.
         * @throws Whatever `Affirm` raises when a required value is missing.
         */
        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(source, 'Invalid source');
            // Expected authentication shape for delta-share
            const { authentication } = source;
            (0, Affirm_1.default)(authentication, 'Invalid authentication for delta-share source');
            this._shareUrl = authentication.host;
            this._bearerToken = SecretManager_1.default.replaceSecret(authentication.bearerToken || authentication.sessionToken || authentication.password);
            this._share = authentication.share;
            this._schema = authentication.schema;
            this._table = authentication.table;
            // The messages name exactly the fields read above.
            (0, Affirm_1.default)(this._shareUrl, 'Missing delta-share host (share server URL) in source.authentication.host');
            (0, Affirm_1.default)(this._bearerToken, 'Missing delta-share bearer token in source.authentication.bearerToken (or sessionToken / password)');
            (0, Affirm_1.default)(this._share, 'Missing delta-share "share" in source.authentication.share');
            (0, Affirm_1.default)(this._schema, 'Missing delta-share schema in source.authentication.schema');
            (0, Affirm_1.default)(this._table, 'Missing delta-share table in source.authentication.table');
            this._source = source;
            return this;
        });
        // Delta Sharing is not a SQL engine; expose explicit error
        this.execute = (__sql) => __awaiter(this, void 0, void 0, function* () {
            void __sql;
            throw new Error('DeltaShareSourceDriver.execute is not supported: Delta Sharing is not a SQL engine');
        });
        // Same as `execute`: always rejects.
        this.query = (__sql, __values) => __awaiter(this, void 0, void 0, function* () {
            void __sql;
            void __values;
            throw new Error('DeltaShareSourceDriver.query is not supported: Delta Sharing is not a SQL engine');
        });
        /**
         * Reads every record of the configured table.
         *
         * @param request - Read request; its `fileKey` must not contain "%".
         * @returns One JSON string per record, in file order.
         */
        this.readAll = (request) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(request, `Invalid download request`);
            (0, Affirm_1.default)(!request.fileKey.includes('%'), `On a delta-share the file key can not include "%"`);
            const deltaFiles = yield this._getAllFilesInTables(this._table);
            const lines = [];
            for (const deltaFile of deltaFiles) {
                const parquetRecords = yield this._readFileRecords(deltaFile);
                // Push one by one: spreading a very large array into push() can
                // exceed the engine's argument limit and throw a RangeError.
                for (const record of parquetRecords) {
                    lines.push(JSON.stringify(record));
                }
            }
            return lines;
        });
        /**
         * Reads the records whose zero-based position across all files is in
         * `[lineFrom, lineTo)`.
         *
         * @param request - Read request with `options.lineFrom` and `options.lineTo`.
         * @returns One JSON string per record inside the range.
         */
        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(request, 'Invalid read request');
            (0, Affirm_1.default)(request.options, 'Invalid read options');
            (0, Affirm_1.default)(request.options.lineFrom !== undefined && request.options.lineTo !== undefined, 'Missing read range');
            const deltaFiles = yield this._getAllFilesInTables(this._table);
            const { options: { lineFrom, lineTo } } = request;
            const lines = [];
            let index = 0;
            for (const deltaFile of deltaFiles) {
                // Range already satisfied: do not download the remaining files.
                if (index >= lineTo)
                    break;
                const parquetRecords = yield this._readFileRecords(deltaFile);
                for (const record of parquetRecords) {
                    if (index >= lineFrom && index < lineTo)
                        lines.push(JSON.stringify(record));
                    index++;
                    if (index >= lineTo)
                        break;
                }
            }
            return lines;
        });
        /**
         * Downloads the whole table into the dataset's local file.
         *
         * Each parquet file is read and handed to
         * `DriverHelper.appendObjectsToUnifiedFile`; the numbers it returns are
         * summed and stored with `dataset.setCount`.
         *
         * @param dataset - Target dataset (provides path and delimiter).
         * @returns The same dataset.
         */
        this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
            (0, Affirm_1.default)(dataset, 'Invalid dataset');
            const deltaFiles = yield this._getAllFilesInTables(this._table);
            let fileIndex = 0;
            let firstLineSet = false;
            let totalLineCount = 0;
            for (const deltaFile of deltaFiles) {
                const parquetRecords = yield this._readFileRecords(deltaFile);
                if (!firstLineSet && parquetRecords.length > 0) {
                    // The first record is intentionally kept as JSON so it can be used to
                    // extract the dimensions. It is taken from the first file that has
                    // rows, which is not necessarily the first file.
                    dataset.setFirstLine(JSON.stringify(parquetRecords[0]));
                    firstLineSet = true;
                }
                totalLineCount += yield DriverHelper_1.default.appendObjectsToUnifiedFile({
                    append: fileIndex > 0,
                    delimiter: dataset.getDelimiter(),
                    destinationPath: dataset.getPath(),
                    objects: parquetRecords
                });
                fileIndex++;
            }
            dataset.setCount(totalLineCount);
            return dataset;
        });
        /**
         * Best-effort existence check: true when the table's file list can be
         * fetched, false on any error.
         */
        this.exist = (__producer) => __awaiter(this, void 0, void 0, function* () {
            void __producer;
            try {
                yield this._getAllFilesInTables(this._table);
                // If it doesn't exist, then it fails in the above function
                return true;
            }
            catch (_a) {
                return false;
            }
        });
        /**
         * Fills a path template with the configured host, share and schema and
         * the given table name.
         *
         * Replacer functions are used so "$" sequences in a value are inserted
         * literally, names are percent-encoded as path segments, and trailing
         * slashes on the host are dropped to avoid "//shares".
         *
         * @param template - One of the path templates defined above.
         * @param table - Table name for the `{table}` placeholder.
         * @returns The request URL.
         */
        this._buildUrl = (template, table) => {
            const prefix = String(this._shareUrl).replace(/\/+$/, '');
            return template
                .replace('{prefix}', () => prefix)
                .replace('{share}', () => encodeURIComponent(this._share))
                .replace('{schema}', () => encodeURIComponent(this._schema))
                .replace('{table}', () => encodeURIComponent(table));
        };
        /**
         * Downloads one parquet file referenced by a query-response line and
         * decodes it.
         *
         * @param deltaFile - Parsed response line with `file.url` and `file.deltaSingleAction`.
         * @returns The records produced by hyparquet's `parquetReadObjects`.
         */
        this._readFileRecords = (deltaFile) => __awaiter(this, void 0, void 0, function* () {
            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
            const action = deltaFile.file.deltaSingleAction;
            // Size of the "add" action when present, otherwise of the "remove" action.
            const addSize = action.add === null || action.add === undefined ? undefined : action.add.size;
            const removeSize = action.remove === null || action.remove === undefined ? undefined : action.remove.size;
            const byteLength = addSize !== null && addSize !== undefined ? addSize : removeSize;
            const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
            return yield parquetReadObjects({ file: file });
        });
        /**
         * Asks the share server for the current version of a table.
         *
         * @param table - Table name.
         * @returns The `delta-table-version` response header as a string, or
         *   `undefined` when the server did not send it.
         */
        this._getVersion = (table) => __awaiter(this, void 0, void 0, function* () {
            const res = yield fetch(this._buildUrl(this._version, table), {
                method: 'GET',
                headers: {
                    Authorization: `Bearer ${this._bearerToken}`
                }
            });
            // The body is always read, as before, so it is consumed on success too.
            const detail = yield res.text();
            (0, Affirm_1.default)(res.ok, `Error fetching version from the delta share: ${res.status} ${res.statusText} (${detail})`);
            // Headers is not a plain object: values must be read with get().
            const version = res.headers.get('delta-table-version');
            return version === null ? undefined : version;
        });
        /**
         * Fetches the list of data files of a table.
         *
         * @param table - Table name.
         * @returns The parsed response lines after the first two.
         */
        this._getAllFilesInTables = (table) => __awaiter(this, void 0, void 0, function* () {
            const res = yield fetch(this._buildUrl(this._query, table), {
                method: 'POST',
                headers: {
                    'Authorization': `Bearer ${this._bearerToken}`,
                    'Content-Type': 'application/json; charset=utf-8',
                    'delta-sharing-capabilities': 'responseformat=delta;readerfeatures=deletionvectors'
                },
                // No "version" is sent, so the server answers with the latest snapshot.
                // Per the protocol a pinned version is only accepted for tables shared
                // with history, so it must not be added unconditionally.
                body: JSON.stringify({})
            });
            const rawText = yield res.text();
            (0, Affirm_1.default)(res.ok, `Error fetching data from the delta share: ${res.status} ${res.statusText}; Message: ${rawText}`);
            // By the protocol: the first line is the protocol, the second is the metadata;
            // only the lines from the third onwards are of interest here.
            const deltaLines = rawText
                .split('\n')
                .filter(x => x.trim().length > 0)
                .slice(2)
                .map(x => JSON.parse(x));
            return deltaLines;
        });
        // Readiness checks are not available for this driver: always rejects.
        this.ready = (request) => __awaiter(this, void 0, void 0, function* () {
            void request;
            throw new Error('DeltaShareSourceDriver.ready is not supported: Delta Sharing does not support readiness checks');
        });
    }
}
exports.default = DeltaShareSourceDriver;