/*
 * @zazuko/trifid-plugin-sparql-proxy
 * Version: (not captured)
 * Trifid plugin for sparql-proxy
 * 392 lines (339 loc) • 14.8 kB
 * JavaScript
 */
// @ts-check
import { Readable } from 'node:stream'
import { ReadableStream } from 'node:stream/web'
import { performance } from 'node:perf_hooks'
import { Worker } from 'node:worker_threads'
import { sparqlGetRewriteConfiguration } from 'trifid-core'
import rdf from '@zazuko/env-node'
import ReplaceStream from './lib/ReplaceStream.js'
import { authBasicHeader, objectLength, isValidUrl } from './lib/utils.js'
// TODO: drop this switch as soon as QLever supports other formats (experimental flag that may be removed at any time)
// Falls back to 'default' when TRIFID_ENGINE_MODE is unset or empty.
const { TRIFID_ENGINE_MODE: configuredEngineMode } = process.env
const engineMode = configuredEngineMode || 'default'
/**
 * Default values for the plugin configuration.
 */
const defaultConfiguration = {
  // Default endpoint and its optional credentials
  endpointUrl: '',
  username: '',
  password: '',

  // Additional named endpoints
  endpoints: {},

  // Base URL of the dataset, used when rewriting
  datasetBaseUrl: '',

  // Allow the user to toggle the rewrite configuration using the `rewrite` query parameter
  allowRewriteToggle: true,
  // Whether rewriting is enabled by default (it is not)
  rewrite: false,
  // Allow rewriting the query
  rewriteQuery: true,
  // Allow rewriting the results
  rewriteResults: true,

  // Values selectable through the `format` query parameter
  formats: {},

  // Log level for queries
  queryLogLevel: 'debug',

  // Script run in a worker thread for the service description
  serviceDescriptionWorkerUrl: new URL('./lib/serviceDescriptionWorker.js', import.meta.url),
  // Max time to wait for the service description
  serviceDescriptionTimeout: 5000,
  // Override the accept header for the service description request.
  // By default, will use content negotiation using formats `@zazuko/env-node` can parse
  serviceDescriptionFormat: undefined,
}
// Thirty days, expressed in milliseconds
const oneMonthMilliseconds = 30 * 24 * 60 * 60 * 1000

// Name of the endpoint entry that is treated as the default one
const DEFAULT_ENDPOINT_NAME = 'default'
/**
 * Trifid plugin that proxies SPARQL requests to one or more upstream endpoints.
 *
 * At start-up it validates the configuration, registers the default endpoint
 * (plus any additional named endpoint) and spawns a worker that receives the
 * default endpoint settings and is expected to report a Service Description.
 *
 * The route handler it returns:
 *  - answers GET requests without any query parameter with the Service Description;
 *  - redirects paths ending with `/` to the same path without the trailing slash;
 *  - forwards every other request as a form-encoded POST to the selected
 *    endpoint, optionally rewriting IRIs in the query and in the results.
 *
 * Throws an `Error` when the default endpoint, an endpoint URL or the
 * `queryLogLevel` configuration is missing or invalid.
 *
 * @type {import('../core/types/index.js').TrifidPlugin}
 */
const factory = async (trifid) => {
  const { logger, config, trifidEvents } = trifid

  const endpoints = new Map()
  const options = { ...defaultConfiguration, ...config }

  let dynamicEndpoints = false
  if (objectLength(options.endpoints) > 0) {
    // Check if the default endpoint is defined
    if (!Object.hasOwnProperty.call(options.endpoints, DEFAULT_ENDPOINT_NAME)) {
      throw new Error('Missing default endpoint in the endpoints configuration')
    }

    // Override default values with the default endpoint values (in case it's a valid URL ; else it might be the default /query).
    // The check has to look at the `url` of the endpoint, not at the endpoint configuration object itself.
    const defaultEndpoint = options.endpoints[DEFAULT_ENDPOINT_NAME]
    if (isValidUrl(defaultEndpoint?.url)) {
      options.endpointUrl = defaultEndpoint.url || ''
      options.username = defaultEndpoint.username || ''
      options.password = defaultEndpoint.password || ''
    }

    // Support for multiple endpoints
    dynamicEndpoints = true
  }

  if (!options.endpointUrl) {
    throw new Error(
      dynamicEndpoints
        ? `Missing endpoints.${DEFAULT_ENDPOINT_NAME}.url parameter`
        : 'Missing endpointUrl parameter',
    )
  }

  // Basic authentication is only used when both parts of the credentials are configured
  let authorizationHeader
  if (options.username && options.password) {
    authorizationHeader = authBasicHeader(options.username, options.password)
  }

  const datasetBaseUrl = options.datasetBaseUrl
  const allowRewriteToggle = options.allowRewriteToggle
  const rewriteConfigValue = options.rewrite
  const rewriteConfig = sparqlGetRewriteConfiguration(rewriteConfigValue, datasetBaseUrl)

  endpoints.set(DEFAULT_ENDPOINT_NAME, {
    endpointUrl: options.endpointUrl,
    username: options.username,
    password: options.password,
    authorizationHeader,
    datasetBaseUrl,
    allowRewriteToggle,
    rewriteConfigValue,
    rewriteConfig,
  })

  if (dynamicEndpoints) {
    for (const [endpointName, endpointConfig] of Object.entries(options.endpoints)) {
      // The default endpoint has already been registered above
      if (endpointName === DEFAULT_ENDPOINT_NAME) {
        continue
      }

      if (!endpointConfig?.url) {
        throw new Error(`Missing endpoints.${endpointName}.url parameter`)
      }

      let endpointAuthorizationHeader
      if (endpointConfig.username && endpointConfig.password) {
        endpointAuthorizationHeader = authBasicHeader(endpointConfig.username, endpointConfig.password)
      }

      // Settings that are not defined on the endpoint fall back to the plugin-level values
      const endpointDatasetBaseUrl = endpointConfig.datasetBaseUrl || datasetBaseUrl
      const endpointRewriteConfigValue = endpointConfig.rewrite ?? rewriteConfigValue

      endpoints.set(endpointName, {
        endpointUrl: endpointConfig.url || '',
        username: endpointConfig.username || '',
        password: endpointConfig.password || '',
        authorizationHeader: endpointAuthorizationHeader,
        datasetBaseUrl: endpointDatasetBaseUrl,
        allowRewriteToggle: endpointConfig.allowRewriteToggle ?? allowRewriteToggle,
        rewriteConfigValue: endpointRewriteConfigValue,
        rewriteConfig: sparqlGetRewriteConfiguration(endpointRewriteConfigValue, endpointDatasetBaseUrl),
      })
    }
  }

  const queryLogLevel = options.queryLogLevel
  if (!logger[queryLogLevel]) {
    throw new Error(`Invalid queryLogLevel: ${queryLogLevel}`)
  }

  /**
   * Log a query, depending on the `queryLogLevel`.
   * @param {string} msg Message to log
   * @returns {void}
   */
  const queryLogger = (msg) => logger[queryLogLevel](msg)

  // The Service Description of the default endpoint is fetched in a worker thread
  const worker = new Worker(options.serviceDescriptionWorkerUrl)
  worker.postMessage({
    type: 'config',
    data: {
      endpointUrl: options.endpointUrl,
      serviceDescriptionTimeout: options.serviceDescriptionTimeout,
      serviceDescriptionFormat: options.serviceDescriptionFormat,
      authorizationHeader,
    },
  })

  // Always resolves with a dataset: the one sent by the worker, or a minimal one on any failure
  const serviceDescription = new Promise((resolve) => {
    const minimalSD = rdf.clownface().blankNode().addOut(rdf.ns.rdf.type, rdf.ns.sd.Service)

    // Without a listener, an 'error' event emitted by the worker would be thrown in this
    // thread, and Service Description requests would wait forever.
    worker.once('error', (error) => {
      logger.error('The Service Description worker failed. Will return a minimal document')
      logger.error(error)
      resolve(minimalSD.dataset)
    })

    worker.once('message', async (message) => {
      const { type, data } = message
      switch (type) {
        case 'serviceDescription':
          try {
            resolve(await rdf.dataset().import(
              rdf.formats.parsers.import('application/n-triples', Readable.from(data)),
            ))
          } catch (error) {
            // A payload that cannot be parsed must not leave the promise pending
            logger.error('Error while parsing the Service Description. Will return a minimal document')
            logger.error(error)
            resolve(minimalSD.dataset)
          }
          break
        case 'serviceDescriptionTimeOut':
          logger.warn('The proxied SPARQL endpoint did not return a Service Description in a timely fashion. Will return a minimal document')
          logger.info('You can increase the timeout using the \'serviceDescriptionTimeout\' configuration')
          resolve(minimalSD.dataset)
          break
        case 'serviceDescriptionError':
          logger.error('Error while fetching the Service Description. Will return a minimal document')
          logger.error(data)
          resolve(minimalSD.dataset)
          break
        default:
          // Only the first message is listened to, so an unknown one has to settle the promise as well
          logger.warn(`Unexpected message from the Service Description worker: ${type}. Will return a minimal document`)
          resolve(minimalSD.dataset)
          break
      }
    })
  })

  trifidEvents.on('close', async () => {
    logger.debug('Got "close" event from Trifid ; closing worker…')
    await worker.terminate().catch(logger.error.bind(logger))
    logger.debug('Worker terminated')
  })

  // Cookie `Max-Age` is expressed in seconds, not in milliseconds
  const endpointCookieMaxAge = Math.floor(oneMonthMilliseconds / 1000)

  return {
    defaultConfiguration: async () => {
      return {
        methods: ['GET', 'POST'],
        paths: [
          '/query',
          '/query/',
        ],
      }
    },
    routeHandler: async () => {
      /**
       * Query string type.
       *
       * @typedef {Object} QueryString
       * @property {string} [query] The SPARQL query.
       * @property {string} [rewrite] Should the query and the results be rewritten?
       * @property {string} [format] The format of the results.
       * @property {string} [endpoint] The name of the endpoint to use (default: DEFAULT_ENDPOINT_NAME).
       */

      /**
       * Request body type.
       * @typedef {Object} RequestBody
       * @property {string} [query] The SPARQL query.
       */

      /**
       * Route handler.
       * @param {import('fastify').FastifyRequest<{ Querystring: QueryString, Body: RequestBody | string }> & { cookies: { endpointName?: string }, accepts: () => { type: (types: string[]) => string[] | string | false }}} request Request.
       * @param {import('fastify').FastifyReply & { setCookie: (name: string, value: string, opts?: any) => {}, clearCookie: (name: string, opts?: any) => {}}} reply Reply.
       */
      const handler = async (request, reply) => {
        const savedEndpointName = request.cookies.endpointName || DEFAULT_ENDPOINT_NAME
        // The parameter may not be a string (e.g. when it is repeated); stringify it so that sanitizing cannot throw
        const requestedEndpointName = String(request.query.endpoint || savedEndpointName)
        const endpointName = requestedEndpointName.replace(/[^a-z0-9-]/gi, '')

        // Resolve the endpoint before touching the cookie, so that an unknown name never gets persisted
        const endpoint = endpoints.get(endpointName)
        if (!endpoint) {
          return reply.callNotFound()
        }

        // Only set the cookie if the endpoint name has changed and if it's not the default endpoint
        if (request.cookies.endpointName !== endpointName && endpointName !== DEFAULT_ENDPOINT_NAME) {
          reply.setCookie('endpointName', endpointName, { maxAge: endpointCookieMaxAge, path: '/' })
        // Clear the cookie if the endpoint name is the default one
        } else if (endpointName === DEFAULT_ENDPOINT_NAME && request.cookies.endpointName !== undefined) {
          reply.clearCookie('endpointName', { path: '/' })
        }
        logger.debug(`Using endpoint: ${endpointName}`)

        const requestPort = request.port ? `:${request.port}` : ''
        const fullUrl = `${request.protocol}://${request.hostname}${requestPort}${request.url}`
        const fullUrlObject = new URL(fullUrl)
        // Keep the original path and query string, as the URL object gets stripped right below
        const { pathname: fullUrlPathname, search: fullUrlSearch } = fullUrlObject

        // Generate the IRI we expect (clearing `search` also empties `searchParams`)
        fullUrlObject.search = ''
        const iriUrlString = fullUrlObject.toString()

        // Handle Service Description request
        if (Object.keys(request.query).length === 0 && request.method === 'GET') {
          // Work on a copy, so that the request URL is not added to the shared dataset
          const dataset = rdf.dataset(await serviceDescription)
          rdf.clownface({ dataset })
            .has(rdf.ns.rdf.type, rdf.ns.sd.Service)
            .addOut(rdf.ns.sd.endpoint, rdf.namedNode(fullUrl))

          const accept = request.accepts()
          const negotiatedTypes = accept.type([...rdf.formats.serializers.keys()])
          const negotiatedType = Array.isArray(negotiatedTypes) ? negotiatedTypes[0] : negotiatedTypes
          if (!negotiatedType) {
            reply.code(406).send()
            return reply
          }

          reply
            .header('content-type', negotiatedType)
            // @ts-ignore (cause: broken type definitions)
            .send(await dataset.serialize({ format: negotiatedType }))
          return reply
        }

        // Enforce non-trailing slash ; the query string is kept, so that the query does not get lost
        if (fullUrlPathname.endsWith('/')) {
          reply.redirect(`${fullUrlPathname.slice(0, -1)}${fullUrlSearch}`)
          return reply
        }

        let currentRewriteConfig = endpoint.rewriteConfig
        if (endpoint.allowRewriteToggle) {
          // Only the literal values `true` and `false` override the configured value
          let rewriteConfigValueFromQuery = endpoint.rewriteConfigValue
          if (`${request.query.rewrite}` === 'false') {
            rewriteConfigValueFromQuery = false
          } else if (`${request.query.rewrite}` === 'true') {
            rewriteConfigValueFromQuery = true
          }
          currentRewriteConfig = sparqlGetRewriteConfiguration(rewriteConfigValueFromQuery, endpoint.datasetBaseUrl)
        }
        const { rewrite: rewriteValue, iriOrigin } = currentRewriteConfig
        const rewriteResponse = rewriteValue
          ? {
              origin: endpoint.datasetBaseUrl,
              replacement: iriOrigin(iriUrlString),
            }
          : false

        let query = ''
        const method = request.method
        switch (method) {
          case 'GET':
            query = request.query.query || ''
            break
          case 'POST': {
            // The body is either the raw query, an object holding it, or missing altogether
            const { body } = request
            query = typeof body === 'string' ? body : (body?.query || '')
            break
          }
          default:
            reply.code(405).send('Method Not Allowed')
            return reply
        }
        // Whatever was received, only a string gets rewritten and forwarded
        if (typeof query !== 'string') {
          query = JSON.stringify(query)
        }

        // Map the IRIs of this instance back to the ones of the dataset
        if (rewriteResponse && options.rewriteQuery) {
          query = query.replaceAll(rewriteResponse.replacement, rewriteResponse.origin)
        }

        logger.debug('Got a request to the sparql proxy')
        queryLogger(`Received query${rewriteValue ? ' (rewritten)' : ''} via ${method}:\n${query}`)

        try {
          let acceptHeader = request.headers.accept || 'application/sparql-results+json'
          const requestedFormat = request.query.format
          // Own properties only, so that names like `constructor` do not resolve to inherited members
          if (typeof requestedFormat === 'string' && Object.hasOwnProperty.call(options.formats, requestedFormat)) {
            acceptHeader = options.formats[requestedFormat] || acceptHeader
          }

          // TODO: remove this tweak once QLever supports other formats
          if (engineMode === 'qlever' && !acceptHeader.startsWith('application/sparql-results+json')) {
            acceptHeader = 'text/turtle'
          }

          const headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            Accept: acceptHeader,
          }
          if (endpoint.authorizationHeader) {
            headers.Authorization = endpoint.authorizationHeader
          }

          // Whatever the incoming method was, the endpoint is queried using a form-encoded POST
          const start = performance.now()
          let response = await fetch(endpoint.endpointUrl, {
            method: 'POST',
            headers,
            body: new URLSearchParams({ query }),
          })
          const end = performance.now()
          const duration = end - start

          // Defensive fallback, in case `fetch` resolves without a response
          if (!response) {
            logger.warn('No response from the endpoint, make sure that the endpoint is reachable')
            response = new Response(JSON.stringify({
              success: false,
              message: 'No response from the endpoint',
            }), { status: 502, headers: { 'content-type': 'application/json' } })
          }

          const contentType = response.headers.get('content-type')

          /** @type {any} */
          let responseStream = response.body
          // A response without body has nothing to rewrite (and `Readable.from(null)` would throw)
          if (responseStream && rewriteResponse && options.rewriteResults) {
            const replaceStream = new ReplaceStream(rewriteResponse.origin, rewriteResponse.replacement)
            responseStream = Readable
              .from(responseStream)
              .pipe(replaceStream)
            responseStream = Readable
              .from(responseStream)
          }
          // Convert a web stream into a Node.js stream before sending it
          if (responseStream instanceof ReadableStream) {
            responseStream = Readable.fromWeb(responseStream)
          }

          let proxyReply = reply
            .status(response.status)
            .header('Server-Timing', `sparql-proxy;dur=${duration};desc="Querying the endpoint"`)
          if (contentType) {
            proxyReply = proxyReply.header('content-type', contentType)
          }
          proxyReply.send(responseStream)
          return proxyReply
        } catch (error) {
          logger.error('Error while querying the endpoint')
          logger.error(error)
          reply
            .code(500)
            .send('Error while querying the endpoint')
          return reply
        }
      }
      return handler
    },
  }
}
export default factory