UNPKG

change-propagation

Version:

Listens to events from Kafka and delivers them

691 lines (666 loc) 30.6 kB
# Hyperswitch spec for a change-propagation instance.
# `spec` declares the system modules mounted under /sys/* and, inside the
# /sys/queue (sys/kafka.js) options, the rule templates. Each rule names a
# topic, optional match / match_not filters, and the request(s) in `exec`.
# The spec is anchored as `&spec` and referenced from `services` at the bottom.
spec: &spec
  x-sub-request-filters:
    - type: default
      name: http
      options:
        allow:
          - pattern: '/^https?:\/\//'
            forward_headers:
              user-agent: true
  title: The Change Propagation root
  paths:
    /sys/ores:
      x-modules:
        - path: sys/ores_updates.js
          options:
            ores_precache_uris:
              - 'https://ores.wikimedia.org/v3/precache'
            eventbus_uri: 'https://eventbus.stubfortests.org/v1/events'
    /sys/limit:
      x-modules:
        - path: sys/rate_limiter.js
          options:
            # Shared Redis connection settings, reused by /sys/dedupe below.
            redis: &redis_config
              host: localhost
              port: 6379
            limiters:
              blacklist:
                # First, allow no more than 100 errors per week
                # The precision parameter controls the step a sliding window moves by
                - interval: 604800
                  limit: 100
                  precision: 86400
                # Secondly to avoid bursts in case of outages, don't allow more than 10
                # errors per hour
                - interval: 3600
                  limit: 10
    /sys/dedupe:
      x-modules:
        - path: sys/deduplicator.js
          options:
            redis: *redis_config
    /sys/purge:
      x-modules:
        - path: sys/purge.js
          options:
            host: '127.0.0.1'
            port: 4321
    /sys/links:
      x-modules:
        - path: sys/dep_updates.js
          options:
            templates:
              mw_api:
                uri: 'https://{{message.meta.domain}}/w/api.php'
                headers:
                  host: '{{message.meta.domain}}'
                body:
                  formatversion: 2
    /sys/queue:
      x-modules:
        - path: sys/kafka.js
          options:
            metadata_broker_list: '127.0.0.1:9092'
            dc_name: test_dc
            startup_delay: 0
            consumer:
              # These options should not be copied to puppet config.
              # We're using this config for testing, so need to configure
              # for minimal latency
              fetch.wait.max.ms: "1"
              fetch.min.bytes: "1"
              queue.buffering.max.ms: "1"
            producer:
              queue.buffering.max.messages: "10"
              compression.codec: snappy
            concurrency: 250
            # Redis-mock does not support evalsha that rate limiting depends on.
            disable_ratelimit: '{env(MOCK_SERVICES)}'
            blacklist:
              en.wikipedia.org:
                - 'User:Nolelover'
                - '/User:Cyberbot_I\//'
            templates:
              summary_definition_rerender: &summary_definition_rerender_spec
                topic: 'resource_change'
                retry_limit: 2
                retry_delay: 500
                retry_on:
                  status:
                    - '5xx'
                limiters:
                  blacklist: 'summary:{message.meta.uri}'
                cases:
                  # Non wiktionary domains - rerender summary
                  - match:
                      meta:
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/api\/rest_v1\/page\/html\/(?<title>[^/]+)$/'
                      tags:
                        - restbase
                    match_not:
                      - meta:
                          domain: '/wiktionary\.org$/'
                      - meta:
                          domain: '/\.wikidata\.org$/'
                    exec:
                      method: get
                      # Don't encode title since it should be already encoded
                      uri: '{{match.meta.uri.proto}}://{{message.meta.domain}}/api/rest_v1/page/summary/{{match.meta.uri.title}}'
                      query:
                        redirect: false
                      headers:
                        cache-control: no-cache
                  # Wiktionary domains - rerender definitions
                  - match:
                      meta:
                        # These URIs are coming from RESTBase, so we know that article titles will be normalized
                        # and main namespace articles will not have : (uri-encoded, so %3a or %3A)
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/api\/rest_v1\/page\/html\/(?<title>(?:(?!%3a|%3A|\/).)+)$/'
                        domain: '/^en\.wiktionary\.org$/'
                      tags:
                        - restbase
                    exec:
                      method: get
                      # Don't encode title since it should be already encoded
                      uri: '{{match.meta.uri.proto}}://{{message.meta.domain}}/api/rest_v1/page/definition/{{match.meta.uri.title}}'
                      query:
                        redirect: false
                      headers:
                        cache-control: no-cache

              # Same rule as above, subscribed to the transcludes topic instead.
              summary_definition_rerender_transcludes: &summary_definition_rerender_transcludes_spec
                <<: *summary_definition_rerender_spec
                topic: 'change-prop.transcludes.resource-change'

              mobile_rerender: &mobile_rerender_spec
                topic: 'resource_change'
                retry_limit: 2
                retry_delay: 500
                retry_on:
                  status:
                    - '5xx'
                limiters:
                  blacklist: 'mobile:{message.meta.uri}'
                match:
                  meta:
                    uri: '/^(?<proto>https?):\/\/[^\/]+\/api\/rest_v1\/page\/html\/(?<title>[^/]+)$/'
                    domain: '/^.+\.wikipedia\.org$/'
                  tags:
                    - restbase
                exec:
                  - method: get
                    uri: '{{match.meta.uri.proto}}://{{message.meta.domain}}/api/rest_v1/page/mobile-sections/{{match.meta.uri.title}}'
                    query:
                      redirect: false
                    headers:
                      cache-control: no-cache
                  # Until we start storing and actively rerendering PCS endpoints we still need to purge it from Varnish
                  - method: post
                    uri: '/sys/purge/'
                    body:
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/media/{{match.meta.uri.title}}'

              # Same rule as above, subscribed to the transcludes topic instead.
              mobile_rerender_transcludes: &mobile_rerender_transcludes_spec
                <<: *mobile_rerender_spec
                topic: 'change-prop.transcludes.resource-change'

              purge_varnish: &purge_varnish_spec
                topic: 'resource_change'
                match:
                  meta:
                    uri: '/^https?:\/\/[^\/]+\/api\/rest_v1\/(?<title>.+)$/'
                  tags:
                    - restbase
                exec:
                  method: post
                  uri: '/sys/purge/'
                  body:
                    - meta:
                        uri: '//{{message.meta.domain}}/api/rest_v1/{{match.meta.uri.title}}'

              # Same rule as above, subscribed to the transcludes topic instead.
              purge_varnish_transcludes: &purge_varnish_transcludes_spec
                <<: *purge_varnish_spec
                topic: 'change-prop.transcludes.resource-change'

              # RESTBase update jobs
              mw_purge:
                topic: resource_change
                limiters:
                  blacklist: 'html:{message.meta.uri}'
                cases:
                  - match:
                      meta:
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/wiki\/(?<title>.+)$/'
                        domain: '/.*\.wikipedia\.org$/'
                      tags:
                        - purge
                    exec: &mw_purge_wikipedia_rerender
                      - method: get
                        # This event comes directly from MediaWiki, so title is encoded in MW-specific way.
                        # Re-encode the title in standard `encodeURIComponent` encoding.
                        uri: '{{match.meta.uri.proto}}://{{message.meta.domain}}/api/rest_v1/page/html/{decode(match.meta.uri.title)}'
                        headers:
                          cache-control: no-cache
                          if-unmodified-since: '{{date(message.meta.dt)}}'
                        query:
                          redirect: false
                      # The HTML might not change but sometimes editors use a purge to drop incorrectly rendered summary/MCS
                      # content, so let's purge them as well just in case. The rate is low.
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/summary/{decode(match.meta.uri.title)}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/mobile-sections/{decode(match.meta.uri.title)}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                  - match:
                      meta:
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/wiki\/(?<title>.+)$/'
                        domain: '/.*\.wiktionary\.org$/'
                      tags:
                        - purge
                    exec: &mw_purge_wiktionary_rerender
                      - method: get
                        # This event comes directly from MediaWiki, so title is encoded in MW-specific way.
                        # Re-encode the title in standard `encodeURIComponent` encoding.
                        uri: '{{match.meta.uri.proto}}://{{message.meta.domain}}/api/rest_v1/page/html/{decode(match.meta.uri.title)}'
                        headers:
                          cache-control: no-cache
                          if-unmodified-since: '{{date(message.meta.dt)}}'
                        query:
                          redirect: false
                      # The HTML might not change but sometimes editors use a purge to drop incorrectly rendered summary/MCS
                      # content, so let's purge them as well just in case. The rate is low.
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/definition/{decode(match.meta.uri.title)}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                  - match:
                      meta:
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/wiki\/(?<title>.+)$/'
                      tags:
                        - purge
                    # Everything that is neither Wikipedia nor Wiktionary, and not a
                    # Wikidata page in namespace 0 or 120. Reused by other rules below.
                    match_not: &others_match_not
                      - meta:
                          domain: '/.*\.wikipedia\.org$/'
                      - meta:
                          domain: '/.*\.wiktionary\.org$/'
                      - meta:
                          domain: '/\.wikidata\.org$/'
                        page_namespace: 0
                      - meta:
                          domain: '/\.wikidata\.org$/'
                        page_namespace: 120
                    exec: &mw_purge_others_rerender
                      - method: get
                        # This event comes directly from MediaWiki, so title is encoded in MW-specific way.
                        # Re-encode the title in standard `encodeURIComponent` encoding.
                        uri: '{{match.meta.uri.proto}}://{{message.meta.domain}}/api/rest_v1/page/html/{decode(match.meta.uri.title)}'
                        headers:
                          cache-control: no-cache
                          if-unmodified-since: '{{date(message.meta.dt)}}'
                        query:
                          redirect: false
                      # The HTML might not change but sometimes editors use a purge to drop incorrectly rendered summary/MCS
                      # content, so let's purge them as well just in case. The rate is low.
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/summary/{decode(match.meta.uri.title)}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false

              # Mirrors mw_purge case by case, but for events tagged `null_edit`.
              # `purge`-tagged events are already handled by mw_purge above.
              null_edit:
                topic: resource_change
                ignore:
                  status:
                    - 403  # Ignoring 403 since some of the pages with high number of null_edit events are blacklisted
                    - 412
                limiters:
                  blacklist: 'html:{message.meta.uri}'
                cases:
                  - match:
                      meta:
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/wiki\/(?<title>.+)$/'
                        domain: '/.*\.wikipedia\.org$/'
                      tags:
                        - null_edit
                    exec: *mw_purge_wikipedia_rerender
                  - match:
                      meta:
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/wiki\/(?<title>.+)$/'
                        domain: '/.*\.wiktionary\.org$/'
                      tags:
                        - null_edit
                    exec: *mw_purge_wiktionary_rerender
                  - match:
                      meta:
                        uri: '/^(?<proto>https?):\/\/[^\/]+\/wiki\/(?<title>.+)$/'
                      tags:
                        - null_edit
                    match_not: *others_match_not
                    exec: *mw_purge_others_rerender

              page_edit:
                topic: mediawiki.revision-create
                limiters:
                  blacklist: 'html:{message.meta.uri}'
                retry_on:
                  status:
                    - '5xx'
                    - 404  # Sometimes occasional 404s happen because of the mysql replication lag, so retry
                match:
                  rev_content_changed: true
                match_not:
                  # Test-only. We use undefined rev_parent_id to test backlinks so we
                  # don't want transclusions to interfere with backlinks test
                  - rev_parent_id: undefined
                  # end of test-only config
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 0
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 120
                exec:
                  - method: get
                    uri: 'https://{{message.meta.domain}}/api/rest_v1/page/html/{message.page_title}/{{message.rev_id}}'
                    headers:
                      cache-control: no-cache
                      x-restbase-parentrevision: '{{message.rev_parent_id}}'
                      if-unmodified-since: '{{date(message.meta.dt)}}'
                    query:
                      redirect: false
                  - method: post
                    uri: '/sys/links/transcludes/{message.page_title}'
                    body: '{{globals.message}}'

              revision_visibility_change:
                topic: mediawiki.revision-visibility-change
                ignore:
                  status:
                    - 403  # When the revision is hidden 403 will be returned by RESTBase, it's a valid situation
                    - 412
                match_not:
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 0
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 120
                exec:
                  - method: get
                    uri: 'https://{{message.meta.domain}}/api/rest_v1/page/title/{message.page_title}/{{message.rev_id}}'
                    headers:
                      cache-control: no-cache
                    query:
                      redirect: false
                  # For page revision restriction RESTBase doesn't emit resource_change events, and to go through
                  # the normal purge chain (html update -> html resource_change -> summary update -> summary resource_change)
                  # we need to add many workarounds/shortcuts in RESTBase. So having this list here is an OK compromise.
                  # Only purge the URIs with a rev_id since the latest revision can not be restricted.
                  - method: post
                    uri: '/sys/purge/'
                    body:
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/html/{message.page_title}/{{message.rev_id}}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/mobile-sections/{message.page_title}/{{message.rev_id}}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/mobile-sections-lead/{message.page_title}/{{message.rev_id}}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/mobile-sections-remaining/{message.page_title}/{{message.rev_id}}'

              page_delete:
                topic: mediawiki.page-delete
                ignore:
                  status:
                    - 404  # 404 is a normal response for page deletion
                    - 412
                match_not:
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 0
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 120
                exec:
                  - method: get
                    uri: 'https://{{message.meta.domain}}/api/rest_v1/page/title/{message.page_title}'
                    headers:
                      cache-control: no-cache
                    query:
                      redirect: false
                  # The links to the deleted page should become red again
                  - method: post
                    uri: '/sys/links/backlinks/{message.page_title}'
                    body: '{{globals.message}}'
                  # For page deletion RESTBase doesn't emit resource_change events, and to go through
                  # the normal purge chain (html update -> html resource_change -> summary update -> summary resource_change)
                  # we need to add many workarounds/shortcuts in RESTBase. So having this list here is an OK compromise.
                  - method: post
                    uri: '/sys/purge/'
                    body:
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/html/{message.page_title}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/html/{message.page_title}/{{message.rev_id}}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/summary/{message.page_title}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/definition/{message.page_title}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/mobile-sections/{message.page_title}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/mobile-sections-lead/{message.page_title}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/mobile-sections-remaining/{message.page_title}'
                      - meta:
                          uri: '//{{message.meta.domain}}/api/rest_v1/page/media/{message.page_title}'

              page_restore:
                topic: mediawiki.page-undelete
                match_not:
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 0
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 120
                exec:
                  - method: get
                    uri: 'https://{{message.meta.domain}}/api/rest_v1/page/title/{message.page_title}'
                    headers:
                      cache-control: no-cache
                    query:
                      redirect: false
                  # The links to the restored page should stop being red (the page exists again)
                  - method: post
                    uri: '/sys/links/backlinks/{message.page_title}'
                    body: '{{globals.message}}'

              page_move:
                topic: mediawiki.page-move
                match_not:
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 0
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 120
                exec:
                  - method: get
                    uri: 'https://{{message.meta.domain}}/api/rest_v1/page/html/{message.page_title}/{{message.rev_id}}'
                    headers:
                      cache-control: no-cache
                      if-unmodified-since: '{{date(message.meta.dt)}}'
                    query:
                      redirect: false
                  - method: get
                    uri: 'https://{{message.meta.domain}}/api/rest_v1/page/title/{message.prior_state.page_title}'
                    headers:
                      cache-control: no-cache
                    query:
                      redirect: false

              on_transclusion_update:
                topic: change-prop.transcludes.resource-change
                limiters:
                  blacklist: 'html:{message.meta.uri}'
                cases:
                  - match:
                      meta:
                        schema_uri: 'resource_change/1'
                        uri: '/https?:\/\/[^\/]+\/wiki\/(?<title>.+)/'
                      tags: ['transcludes']
                    exec:
                      method: get
                      uri: 'https://{{message.meta.domain}}/api/rest_v1/page/html/{{match.meta.uri.title}}'
                      headers:
                        cache-control: no-cache
                        if-unmodified-since: '{{date(message.root_event.dt)}}'
                        x-restbase-mode: '{{message.tags[1]}}'
                      query:
                        redirect: false
                  - match:
                      meta:
                        schema_uri: 'continue/1'
                    exec:
                      method: post
                      uri: '/sys/links/transcludes/{message.original_event.page_title}'
                      body: '{{globals.message}}'

              page_create:
                topic: mediawiki.page-create
                retry_on:
                  status:
                    - '5xx'
                    - 404  # Sometimes occasional 404s happen because of the mysql replication lag, so retry
                match_not:
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 0
                  - meta:
                      domain: '/\.wikidata\.org$/'
                    page_namespace: 120
                exec:
                  - method: post
                    uri: '/sys/links/backlinks/{message.page_title}'
                    body: '{{globals.message}}'

              on_backlinks_update:
                topic: change-prop.backlinks.resource-change
                limiters:
                  blacklist: 'html:{message.meta.uri}'
                cases:
                  - match:
                      meta:
                        schema_uri: 'resource_change/1'
                        uri: '/https?:\/\/[^\/]+\/wiki\/(?<title>.+)/'
                      tags: ['backlinks']
                    exec:
                      method: get
                      uri: 'https://{{message.meta.domain}}/api/rest_v1/page/html/{{match.meta.uri.title}}'
                      headers:
                        cache-control: no-cache
                        if-unmodified-since: '{{date(message.root_event.dt)}}'
                        x-restbase-mode: '{{message.tags[1]}}'
                      query:
                        redirect: false
                  - match:
                      meta:
                        schema_uri: 'continue/1'
                    exec:
                      method: post
                      uri: '/sys/links/backlinks/{message.original_event.page_title}'
                      body: '{{globals.message}}'

              # ORES caching updates
              ores_cache:
                topic: mediawiki.revision-create
                concurrency: 10
                ignore:
                  status:
                    - 503
                exec:
                  method: post
                  uri: '/sys/ores/'
                  query:
                    postevent: true
                  body: '{{globals.message}}'

              wikidata_description_on_edit:
                topic: mediawiki.revision-create
                match:
                  meta:
                    domain: www.wikidata.org
                  page_namespace: 0
                  # It's impossible to modify a comment in wikidata while editing the entity.
                  # TODO: This is a temp solution until we get a more general fragment support T148079
                  comment: '/wbeditentity|wbsetdescription|undo|restore/'
                  rev_content_changed: true
                exec:
                  method: post
                  uri: '/sys/links/wikidata_descriptions'
                  body: '{{globals.message}}'

              wikidata_description_on_undelete:
                topic: mediawiki.page-undelete
                match:
                  meta:
                    domain: www.wikidata.org
                  page_namespace: 0
                exec:
                  method: post
                  uri: '/sys/links/wikidata_descriptions'
                  body: '{{globals.message}}'

              on_wikidata_description_change:
                topic: change-prop.wikidata.resource-change
                cases:
                  - match:
                      meta:
                        uri: '/https:\/\/[^\/]+\/wiki\/(?<title>.+)/'
                        domain: '/.*\.wikipedia\.org$/'
                      tags: ['wikidata']
                    exec:
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/summary/{{match.meta.uri.title}}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/mobile-sections/{{match.meta.uri.title}}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                  - match:
                      meta:
                        uri: '/https:\/\/[^\/]+\/wiki\/(?<title>.+)/'
                      tags: ['wikidata']
                    match_not: *others_match_not
                    exec:
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/summary/{{match.meta.uri.title}}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false

              page_images:
                topic: mediawiki.page-properties-change
                # We don't support 'OR' in the match section, so workaround it by 2 cases with identical exec
                cases:
                  - match:
                      meta:
                        domain: '/.*\.wikipedia\.org$/'
                      added_properties:
                        page_image: '/.+/'
                    exec: &page_images_wikipedia_rerender
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/summary/{message.page_title}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/mobile-sections/{message.page_title}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                  - match:
                      added_properties:
                        page_image: '/.+/'
                    match_not: *others_match_not
                    exec: &page_images_others_rerender
                      - method: get
                        uri: 'https://{{message.meta.domain}}/api/rest_v1/page/summary/{message.page_title}'
                        headers:
                          cache-control: no-cache
                        query:
                          redirect: false
                  - match:
                      meta:
                        domain: '/.*\.wikipedia\.org$/'
                      removed_properties:
                        page_image: '/.+/'
                    exec: *page_images_wikipedia_rerender
                  - match:
                      removed_properties:
                        page_image: '/.+/'
                    match_not: *others_match_not
                    exec: *page_images_others_rerender

              # Map tile cache invalidation
              purge_map_tile:
                topic: 'resource_change'
                match:
                  tags:
                    - tilerator
                exec:
                  method: post
                  uri: '/sys/purge/'
                  body:
                    - meta:
                        uri: '{{message.meta.uri}}'

num_workers: 0

logging:
  name: changeprop
  level: fatal
  streams:
    - type: stdout

services:
  - name: changeprop
    module: hyperswitch
    conf:
      port: 7272
      user_agent: SampleChangePropInstance
      spec: *spec