const mime = require('mime-types');
const FileType = require('file-type');
const request = require('@tryghost/request');
const errors = require('@tryghost/errors');
const logging = require('@tryghost/logging');
const string = require('@tryghost/string');
const path = require('path');

/**
 * Escape every regular-expression metacharacter in `value` so it can be
 * embedded in a `RegExp` source and only ever match itself literally.
 *
 * @param {string} value - text to embed in a regular expression
 * @returns {string} the escaped text
 */
const escapeRegExp = (value) => {
    return String(value).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
};

/**
 * Downloads media referenced by posts, post meta, tags and users from a list
 * of external domains, stores it through the configured storage adapter and
 * rewrites the references to `__GHOST_URL__`-prefixed local paths.
 */
class ExternalMediaInliner {
    /** @type {object} */
    #PostModel;

    /** @type {object} */
    #PostMetaModel;

    /** @type {object} */
    #TagModel;

    /** @type {object} */
    #UserModel;

    /**
     *
     * @param {Object} deps
     * @param {Object} deps.PostModel - Post model
     * @param {Object} deps.PostMetaModel - PostMeta model
     * @param {Object} deps.TagModel - Tag model
     * @param {Object} deps.UserModel - User model
     * @param {(extension) => import('ghost-storage-base')} deps.getMediaStorage - getMediaStorage
     */
    constructor(deps) {
        this.#PostModel = deps.PostModel;
        this.#PostMetaModel = deps.PostMetaModel;
        this.#TagModel = deps.TagModel;
        this.#UserModel = deps.UserModel;
        this.getMediaStorage = deps.getMediaStorage;
    }

    /**
     * Download a remote media file. Failures are logged and reported as `null`
     * rather than thrown, so one broken URL does not abort the whole run.
     *
     * @param {string} requestURL - url of remote media
     * @returns {Promise<Object|null>} the response (body is a buffer), or null when the download failed
     */
    async getRemoteMedia(requestURL) {
        // @NOTE: this is the most expensive operation in the whole inlining process
        //        we should consider caching the results to improve performance

        // Enforce http for protocol-relative URLs - http > https redirects are commonplace
        const withProtocol = requestURL.replace(/^\/\//, 'http://');

        // Encode to handle special characters in URLs
        const encodedURL = encodeURI(withProtocol);

        try {
            return await request(encodedURL, {
                followRedirect: true,
                responseType: 'buffer'
            });
        } catch (error) {
            // NOTE: add special case for 404s
            logging.error(`Error downloading remote media: ${encodedURL}`);
            logging.error(new errors.DataImportError({
                err: error
            }));

            return null;
        }
    }

    /**
     * Work out the extension and a file-system-safe file name for a downloaded file.
     *
     * The extension is taken, in order of preference, from the file contents,
     * the `content-type` response header, and finally the URL path.
     *
     * @param {string} requestURL - url the media was downloaded from
     * @param {Object} response - response from request
     * @returns {Promise<{fileBuffer: Buffer, filename: string, extension: string}>}
     */
    async extractFileDataFromResponse(requestURL, response) {
        let extension;

        // Attempt to get the file extension from the file itself
        try {
            const fileInfo = await FileType.fromBuffer(response.body);
            // `fileInfo` is undefined when the type cannot be detected
            extension = fileInfo?.ext;
        } catch (err) {
            // Deliberately ignored: detection is best-effort and the fallbacks below take over
            extension = undefined;
        }

        // If that failed, or if `.ext` is undefined, use the content type or the file path
        if (!extension) {
            const contentType = response.headers?.['content-type'];
            // Keep digits so extensions such as `mp4` or `mp3` are not truncated to `mp`
            const extensionFromPath = path.parse(requestURL).ext.split(/[^a-z0-9]/i).filter(Boolean)[0];
            extension = mime.extension(contentType) || extensionFromPath;
        }

        // The dot and the extension are escaped so only a literal `.ext` is removed
        // (an unescaped `.` would also strip e.g. `-png` from `my-png-image.png`)
        const removeExtRegExp = new RegExp(`\\.${escapeRegExp(extension)}`);
        const fileNameNoExt = path.parse(requestURL).base.replace(removeExtRegExp, '');

        // CASE: Query strings _can_ form part of the unique image URL, so rather than strip them, include them in the file name
        // Then trim to last 248 chars (this will be more unique than the first 248), and trim leading & trailing dashes.
        // 248 is on the lower end of limits from various OSes and file systems
        const fileName = string.slugify(path.parse(fileNameNoExt).base, {
            requiredChangesOnly: true
        }).slice(-248).replace(/^-+|-+$/g, '');

        return {
            fileBuffer: response.body,
            filename: `${fileName}.${extension}`,
            extension: `.${extension}`
        };
    }

    /**
     * Save a downloaded file through the storage adapter registered for its extension.
     *
     * @param {Object} media - media to store locally
     * @param {Buffer} media.fileBuffer - file contents
     * @param {string} media.filename - file name including extension
     * @param {string} media.extension - extension including the leading dot
     * @returns {Promise<string|null>} - path to stored media, or null when no adapter handles the extension
     */
    async storeMediaLocally(media) {
        const storage = this.getMediaStorage(media.extension);

        if (!storage) {
            logging.warn(`No storage adapter found for file extension: ${media.extension}`);
            return null;
        }

        // @NOTE: this is extremely convoluted and should live on a
        //        storage adapter level
        const targetDir = storage.getTargetDir(storage.storagePath);
        const uniqueFileName = await storage.getUniqueFileName({
            name: media.filename
        }, targetDir);
        const targetPath = path.relative(storage.storagePath, uniqueFileName);

        return await storage.saveRaw(media.fileBuffer, targetPath);
    }

    /**
     * Download one remote file, store it locally and build its inlined URL.
     *
     * @param {string} src - url of remote media
     * @returns {Promise<string|null>} `__GHOST_URL__`-prefixed path, or null when any step failed
     */
    async #downloadAndStore(src) {
        const response = await this.getRemoteMedia(src);
        if (!response) {
            return null;
        }

        const media = await this.extractFileDataFromResponse(src, response);
        if (!media) {
            return null;
        }

        const filePath = await this.storeMediaLocally(media);
        if (!filePath) {
            return null;
        }

        return `__GHOST_URL__${filePath}`;
    }

    /**
     * Find & inline external media from a JSON string.
     * This works with both Lexical & Mobiledoc, so no separate methods are needed here.
     *
     * @param {string} content - stringified JSON of post Lexical or Mobiledoc content
     * @param {String[]} domains - domains to inline media from
     * @returns {Promise<string>} - updated stringified JSON of post content
     */
    async inlineContent(content, domains) {
        // NOTE: the src could end with a quote, apostrophe or backslash. Backslashes are added to content
        //       as an escape character
        const srcTerminationSymbols = `"|'|\\\\`;

        // Each distinct URL is downloaded and stored once per call, even if it occurs several times
        const inlinedBySrc = new Map();

        for (const domain of domains) {
            // The domain is matched literally; unescaped, its dots would match any character
            const regex = new RegExp(`(${escapeRegExp(domain)}.*?)(${srcTerminationSymbols})`, 'igm');

            // matchAll scans the content as it was at this point; the replacements
            // below do not affect the matches still to come for this domain
            for (const [, src] of content.matchAll(regex)) {
                if (!inlinedBySrc.has(src)) {
                    inlinedBySrc.set(src, await this.#downloadAndStore(src));
                }

                const inlinedSrc = inlinedBySrc.get(src);

                if (inlinedSrc) {
                    // A function replacer keeps `$` sequences in the new path from being
                    // interpreted as replacement patterns
                    content = content.replace(src, () => inlinedSrc);
                    logging.info(`Inlined media: ${src} -> ${inlinedSrc}`);
                }
            }
        }

        return content;
    }

    /**
     * Inline the media referenced by individual URL fields of a model instance.
     *
     * @param {Object} resourceModel - one of PostModel, TagModel, UserModel instances
     * @param {String[]} fields - fields to inline
     * @param {String[]} domains - domains to inline media from
     * @returns {Promise<Object>} - updated fields map with local media paths
     */
    async inlineFields(resourceModel, fields, domains) {
        const updatedFields = {};

        for (const field of fields) {
            const src = resourceModel.get(field);

            // Checking all domains at once downloads a field at most once, even when
            // several of the given domains are a prefix of its value
            const isExternal = typeof src === 'string' && domains.some(domain => src.startsWith(domain));
            if (!isExternal) {
                continue;
            }

            const inlinedSrc = await this.#downloadAndStore(src);

            if (inlinedSrc) {
                updatedFields[field] = inlinedSrc;
                logging.info(`Added media to inline: ${src} -> ${inlinedSrc}`);
            }
        }

        return updatedFields;
    }

    /**
     * Inline URL fields for a list of resources and persist every resource that changed.
     * A failure on one resource is logged and does not stop the others.
     *
     * @param {Object[]} resources - array of model instances
     * @param {Object} model - resource model
     * @param {string[]} fields - fields to inline
     * @param {string[]} domains - domains to inline media from
     */
    async inlineSimpleFields(resources, model, fields, domains) {
        logging.info(`Starting inlining external media for ${resources?.length} resources and with ${fields.join(', ')} fields`);

        for (const resource of resources ?? []) {
            try {
                const updatedFields = await this.inlineFields(resource, fields, domains);

                if (Object.keys(updatedFields).length > 0) {
                    await model.edit(updatedFields, {
                        id: resource.id,
                        context: {
                            internal: true
                        }
                    });
                }
            } catch (err) {
                logging.error(`Error inlining media for: ${resource.id}`);
                logging.error(new errors.DataImportError({
                    err
                }));
            }
        }
    }

    /**
     * Inline external media for all posts (feature image, mobiledoc and lexical
     * content), post meta, tags and users.
     *
     * @param {string[]} domains domains to inline media from
     */
    async inline(domains) {
        const {data: posts} = await this.#PostModel.findPage({
            limit: 'all',
            status: 'all'
        });
        const postInliningFields = [
            'feature_image'
        ];

        logging.info(`Starting inlining external media for posts: ${posts?.length}`);

        for (const post of posts ?? []) {
            try {
                const mobiledocContent = post.get('mobiledoc');
                const lexicalContent = post.get('lexical');

                const updatedFields = await this.inlineFields(post, postInliningFields, domains);

                if (mobiledocContent) {
                    const inlinedContent = await this.inlineContent(mobiledocContent, domains);

                    // If content has changed, update the post
                    if (inlinedContent !== mobiledocContent) {
                        updatedFields.mobiledoc = inlinedContent;
                    }
                }

                if (lexicalContent) {
                    const inlinedContent = await this.inlineContent(lexicalContent, domains);

                    // If content has changed, update the post
                    if (inlinedContent !== lexicalContent) {
                        updatedFields.lexical = inlinedContent;
                    }
                }

                if (Object.keys(updatedFields).length > 0) {
                    await this.#PostModel.edit(updatedFields, {
                        id: post.id,
                        context: {
                            internal: true
                        }
                    });
                }
            } catch (err) {
                logging.error(`Error inlining media for post: ${post.id}`);
                logging.error(new errors.DataImportError({
                    err
                }));
            }
        }

        const {data: postsMetas} = await this.#PostMetaModel.findPage({
            limit: 'all'
        });
        const postsMetaInliningFields = [
            'og_image',
            'twitter_image'
        ];

        await this.inlineSimpleFields(postsMetas, this.#PostMetaModel, postsMetaInliningFields, domains);

        const {data: tags} = await this.#TagModel.findPage({
            limit: 'all'
        });
        const tagInliningFields = [
            'feature_image',
            'og_image',
            'twitter_image'
        ];

        await this.inlineSimpleFields(tags, this.#TagModel, tagInliningFields, domains);

        const {data: users} = await this.#UserModel.findPage({
            limit: 'all'
        });
        const userInliningFields = [
            'profile_image',
            'cover_image'
        ];

        await this.inlineSimpleFields(users, this.#UserModel, userInliningFields, domains);

        logging.info('Finished inlining external media for posts, tags, and users');
    }
}

module.exports = ExternalMediaInliner;