UNPKG

@tensorflow/tfjs-layers

Version:

TensorFlow layers API in JavaScript

142 lines 18.8 kB
/**
 * @license
 * Copyright 2023 Google LLC.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
/**
 * GPT-2 preprocessor layer.
 */
/* Original source: keras-nlp/models/gpt2/gpt2_preprocessor.py */
import { serialization, tidy } from '@tensorflow/tfjs-core';
import { Preprocessor } from '../preprocessor';
import { GPT2Tokenizer } from './gpt2_tokenizer';
import { StartEndPacker } from '../../preprocessing/start_end_packer';
import { ValueError } from '../../../../errors';

/**
 * Bundles preprocessed features with optional labels and sample weights.
 *
 * @param x The preprocessed features.
 * @param y Optional label data, passed through unaltered.
 * @param sampleWeight Optional sample weights, passed through unaltered.
 * @returns `x` alone when `y` is `undefined` (a `sampleWeight` supplied
 *     without `y` is dropped), `[x, y]` when only `sampleWeight` is
 *     `undefined`, and `[x, y, sampleWeight]` otherwise.
 */
export function packXYSampleWeight(x, y, sampleWeight) {
  if (y === undefined) {
    return x;
  }
  if (sampleWeight === undefined) {
    return [x, y];
  }
  return [x, y, sampleWeight];
}

/**
 * GPT2 preprocessing layer which tokenizes and packs inputs.
 *
 * This preprocessing layer will do 2 things:
 *
 * - Tokenize the inputs using the `tokenizer`.
 * - Pack the tokens with a `StartEndPacker` configured from the tokenizer's
 *   start, end and pad token ids, producing `tokenIds` and `paddingMask`.
 *
 * `call()` returns only `tokenIds`. `callAndPackArgs()` returns an object
 * with keys `"tokenIds"` and `"paddingMask"` that can be passed directly to a
 * `GPT2Backbone`, optionally bundled with `kwargs.y` and
 * `kwargs.sampleWeight`, which are passed through unaltered.
 *
 * `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is
 * mainly used for generation tasks. For tasks having multi-segment inputs
 * like "glue/mnli", please use a model designed for classification purposes
 * such as BERT or RoBERTa.
 *
 * Examples:
 *
 * Directly calling the layer on data.
 * ```js
 * const vocabulary =
 *     new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);
 * const merges =
 *     ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];
 * const tokenizer = new GPT2Tokenizer({vocabulary, merges});
 *
 * const preprocessor = new GPT2Preprocessor({tokenizer});
 * preprocessor.call(tensor(['the quick brown fox jumped.'])).print();
 * ```
 */
class GPT2Preprocessor extends Preprocessor {
  /**
   * @param args Layer configuration, also forwarded to the base class.
   *     `args.tokenizer` is the tokenizer used by this layer;
   *     `args.sequenceLength` defaults to 1024; `args.addStartToken` and
   *     `args.addEndToken` both default to `true`.
   */
  constructor(args) {
    super(args);
    this.tokenizer = args.tokenizer;
    // Only `null`/`undefined` fall back to the default, so an explicit
    // `false` (or `0`) supplied by the caller is respected.
    this.sequenceLength =
        args.sequenceLength != null ? args.sequenceLength : 1024;
    this.addStartToken =
        args.addStartToken != null ? args.addStartToken : true;
    this.addEndToken = args.addEndToken != null ? args.addEndToken : true;

    const gpt2Tokenizer = this.tokenizer;
    this.packer = new StartEndPacker({
      startValue: gpt2Tokenizer.startTokenId,
      endValue: gpt2Tokenizer.endTokenId,
      padValue: gpt2Tokenizer.padTokenId,
      sequenceLength: this.sequenceLength,
    });
  }

  /**
   * Returns this layer's own settings merged with the base class config.
   * Keys present in the base config take precedence on conflict.
   */
  getConfig() {
    const config = {
      sequenceLength: this.sequenceLength,
      addStartToken: this.addStartToken,
      addEndToken: this.addEndToken,
    };
    const baseConfig = super.getConfig();
    Object.assign(config, baseConfig);
    return config;
  }

  /**
   * Tokenizes and packs `inputs`, returning only the packed token ids.
   *
   * @param inputs A single segment, or an array holding exactly one segment.
   * @param kwargs Optional call options; `kwargs.sequenceLength` overrides
   *     the configured sequence length for this call. May be omitted.
   * @throws ValueError if `inputs` is an array whose length is not 1.
   */
  call(inputs, kwargs) {
    return this.callAndReturnPaddingMask(inputs, kwargs).tokenIds;
  }

  /**
   * Tokenizes and packs `inputs` inside a `tidy()` scope.
   *
   * @param inputs A single segment, or an array holding exactly one segment.
   * @param kwargs Optional call options (see `call`). May be omitted.
   * @returns An object of the form `{tokenIds, paddingMask}`.
   * @throws ValueError if `inputs` is an array whose length is not 1.
   */
  callAndReturnPaddingMask(inputs, kwargs) {
    // Callers may omit the options object (the class example above does), so
    // normalise it here instead of dereferencing `undefined`.
    const options = kwargs != null ? kwargs : {};
    return tidy(() => {
      let segment = inputs;
      if (inputs instanceof Array) {
        if (inputs.length !== 1) {
          throw new ValueError(
              'GPT2 requires each input feature to contain only ' +
              `one segment, but received ${inputs.length}. If you are using ` +
              'GPT2 for a multi-segment classification task, please refer to ' +
              'classification models like BERT or RoBERTa.');
        }
        segment = inputs[0];
      }

      const sequenceLength = options.sequenceLength != null ?
          options.sequenceLength :
          this.sequenceLength;
      const [tokenIds, paddingMask] = this.packer.callAndReturnPaddingMask(
          this.tokenizer.call(segment), {
            sequenceLength,
            addStartValue: this.addStartToken,
            addEndValue: this.addEndToken,
          });
      return { tokenIds: tokenIds, paddingMask: paddingMask };
    });
  }

  /**
   * Calls the layer and returns extra information like the paddingMask used to
   * pack the sequence, the label data, and the sample weights used.
   *
   * @param inputs A single segment, or an array holding exactly one segment.
   * @param kwargs Optional call options; `kwargs.y` and `kwargs.sampleWeight`
   *     are passed through unaltered. May be omitted.
   * @returns The `{tokenIds, paddingMask}` object, bundled with `y` and
   *     `sampleWeight` according to `packXYSampleWeight`.
   */
  callAndPackArgs(inputs, kwargs) {
    // Same normalisation as `callAndReturnPaddingMask`: tolerate a missing
    // options object rather than throwing on `kwargs.y`.
    const options = kwargs != null ? kwargs : {};
    const x = this.callAndReturnPaddingMask(inputs, options);
    return packXYSampleWeight(x, options.y, options.sampleWeight);
  }

  /**
   * Returns the tokenizer class paired with this preprocessor.
   *
   * @param cls Unused; kept so the signature stays unchanged.
   */
  static tokenizerCls(cls) {
    return GPT2Tokenizer;
  }
}
/** @nocollapse */
GPT2Preprocessor.className = 'GPT2Preprocessor';
export { GPT2Preprocessor };
serialization.registerClass(GPT2Preprocessor); //# 
sourceMappingURL=data:application/json;base64,{"version":3,"file":"gpt2_preprocessor.js","sourceRoot":"","sources":["../../../../../../../../../tfjs-layers/src/layers/nlp/models/gpt2/gpt2_preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH;;GAEG;AAEH,iEAAiE;AACjE,OAAO,EAAoC,aAAa,EAAE,IAAI,EAAE,MAAM,uBAAuB,CAAC;AAG9F,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AACtE,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AA8ChD,MAAM,UAAU,kBAAkB,CAChC,CAAiB,EAAE,CAAU,EAAE,YAAqB;IAKpD,IAAI,CAAC,KAAK,SAAS,EAAE;QACnB,OAAO,CAAC,CAAC;KACV;SAAM,IAAI,YAAY,KAAK,SAAS,EAAE;QACrC,OAAO,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;KACf;SAAM;QACL,OAAO,CAAC,CAAC,EAAE,CAAC,EAAE,YAAY,CAAC,CAAC;KAC7B;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AACH,MAAa,gBAAiB,SAAQ,YAAY;IAShD,YAAY,IAA0B;;QACpC,KAAK,CAAC,IAAI,CAAC,CAAC;QACZ,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC;QAChC,IAAI,CAAC,cAAc,GAAG,MAAA,IAAI,CAAC,cAAc,mCAAI,IAAI,CAAC;QAClD,IAAI,CAAC,aAAa,GAAG,MAAA,IAAI,CAAC,aAAa,mCAAI,IAAI,CAAC;QAChD,IAAI,CAAC,WAAW,GAAG,MAAA,IAAI,CAAC,WAAW,mCAAI,IAAI,CAAC;QAE5C,MAAM,aAAa,GAAG,IAAI,CAAC,SAA0B,CAAC;QACtD,IAAI,CAAC,MAAM,GAAG,IAAI,cAAc,CAAC;YAC/B,UAAU,EAAE,aAAa,CAAC,YAAY;YACtC,QAAQ,EAAE,aAAa,CAAC,UAAU;YAClC,QAAQ,EAAE,aAAa,CAAC,UAAU;YAClC,cAAc,EAAE,IAAI,CAAC,cAAc;SACpC,CAAC,CAAC;IACL,CAAC;IAEQ,SAAS;QAChB,MAAM,MAAM,GAAG;YACb,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,aAAa,EAAE,IAAI,CAAC,aAAa;YACjC,WAAW,EAAE,IAAI,CAAC,WAAW;SAC9B,CAAC;QACF,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IAEQ,IAAI,CACX,MAAuB,EAAE,MAA+B;QACxD,OAAO,IAAI,CAAC,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC;IAChE,CAAC;IAEO,wBAAwB,CAC9B,MAAuB,EACvB,MAA+B;QAE/B,OAAO,IAAI,CAAC,GAAG,EAAE;;YACf,IAAI,MAAM,YAAY,KAAK,EAAE;gBAC3B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE;oBACvB,MAAM,IAAI,UAAU,CAClB,mDAAmD;wBACnD,6BAA6B,MAAM,CAAC,MAAM,qBAAqB;wBAC/D,gEAAgE;wBAChE,6CAA6C,CAC9C,CAAC;iBACH;gBACD,MAAM,GAAG,MAAM,CA
AC,CAAC,CAAC,CAAC;aACpB;YAED,MAAM,cAAc,GAAG,MAAA,MAAM,CAAC,cAAc,mCAAI,IAAI,CAAC,cAAc,CAAC;YACpE,MAAM,CAAC,QAAQ,EAAE,WAAW,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,wBAAwB,CAClE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,EAC3B;gBACE,cAAc;gBACd,aAAa,EAAE,IAAI,CAAC,aAAa;gBACjC,WAAW,EAAE,IAAI,CAAC,WAAW;aAC9B,CACF,CAAC;YAEF,OAAO;gBACL,QAAQ,EAAE,QAAoB;gBAC9B,WAAW,EAAE,WAAuB;aACrC,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACH,eAAe,CAAC,MAAuB,EAAE,MAA+B;QAItE,MAAM,CAAC,GAAG,IAAI,CAAC,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxD,OAAO,kBAAkB,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,EAAE,MAAM,CAAC,YAAY,CAAC,CAAC;IAC9D,CAAC;IAED,MAAM,CAAU,YAAY,CAC1B,GAA6C;QAC7C,OAAO,aAAa,CAAC;IACvB,CAAC;;AAzFD,kBAAkB;AACF,0BAAS,GAAG,kBAAkB,CAAC;SAFpC,gBAAgB;AA4F7B,aAAa,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC","sourcesContent":["/**\n * @license\n * Copyright 2023 Google LLC.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n * =============================================================================\n */\n\n/**\n * GPT-2 preprocessor layer.\n */\n\n/* Original source: keras-nlp/models/gpt2/gpt2_preprocessor.py */\nimport { NamedTensorMap, Tensor, Tensor2D, serialization, tidy } from '@tensorflow/tfjs-core';\n\nimport { LayerArgs } from '../../../../engine/topology';\nimport { Preprocessor } from '../preprocessor';\nimport { GPT2Tokenizer } from './gpt2_tokenizer';\nimport { StartEndPacker } from '../../preprocessing/start_end_packer';\nimport { ValueError } from '../../../../errors';\n\nexport declare interface 
GPT2PreprocessorArgs extends LayerArgs {\n  /**\n   * A GPT2Tokenizer instance.\n   */\n  tokenizer: GPT2Tokenizer;\n\n  /**\n   * The length of the packed inputs.\n   * Defaults to 1024.\n   */\n  sequenceLength?: number;\n\n  /**\n   * If `true`, the preprocessor will prepend the tokenizer start token to each\n   * input sequence.\n   * Defaults to `true`.\n   */\n  addStartToken?: boolean;\n\n  /**\n   * If `true`, the preprocessor will prepend the tokenizer end token to each\n   * input sequence.\n   * Defaults to `true`.\n   */\n  addEndToken?: boolean;\n}\n\nexport declare interface GPT2PreprocessorOptions {\n  /**\n   * Any label data. Will be passed through unaltered.\n   */\n  y?: Tensor;\n\n  /**\n   * Any label weight data. Will be passed through unaltered.\n   */\n  sampleWeight?: Tensor;\n\n  /**\n   * Pass to override the configured `sequenceLength` of the layer.\n   */\n  sequenceLength?: number;\n}\n\nexport function packXYSampleWeight(\n  x: NamedTensorMap, y?: Tensor, sampleWeight?: Tensor):\n  NamedTensorMap\n  | [NamedTensorMap, Tensor]\n  | [NamedTensorMap, Tensor, Tensor] {\n\n  if (y === undefined) {\n    return x;\n  } else if (sampleWeight === undefined) {\n    return [x, y];\n  } else {\n    return [x, y, sampleWeight];\n  }\n}\n\n/**\n * GPT2 preprocessing layer which tokenizes and packs inputs.\n *\n * This preprocessing layer will do 2 things:\n *\n * - Tokenize the inputs using the `tokenizer`.\n * - Construct a dictionary with keys `\"tokenIds\"`, `\"paddingMask\"`, that can\n *     be passed directly to a `GPT2Backbone`.\n *\n * The call method of this layer accepts three arguments, `x`, `y`, and\n * `sampleWeight`. 
`x` can be a string or tensor representing a single\n * segment, a list of strings representing a batch of single segments,\n * or a list of tensors representing multiple segments to be packed together.\n * `y` and `sampleWeight` are both optional, can have any format, and will be\n * passed through unaltered.\n *\n * `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is\n * mainly used for generation tasks. For tasks having multi-segment inputs\n * like \"glue/mnli\", please use a model designed for classification purposes\n * such as BERT or RoBERTa.\n *\n * Examples:\n *\n * Directly calling the layer on data.\n * ```js\n * const features =  ['a quick fox.', 'a fox quick.'];\n * const vocabulary =\n *    new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);\n * const merges =\n *    ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];\n * const tokenizer = GPT2Tokenizer({vocabulary, merges});\n *\n * const preprocessor = GPT2Preprocessor({tokenizer});\n * preprocessor.call(tensor(['the quick brown fox jumped.']))[0].print();\n * ```\n */\nexport class GPT2Preprocessor extends Preprocessor {\n  /** @nocollapse */\n  static override className = 'GPT2Preprocessor';\n\n  protected readonly sequenceLength: number;\n  protected readonly addStartToken: boolean;\n  protected readonly addEndToken: boolean;\n  protected readonly packer: StartEndPacker;\n\n  constructor(args: GPT2PreprocessorArgs) {\n    super(args);\n    this.tokenizer = args.tokenizer;\n    this.sequenceLength = args.sequenceLength ?? 1024;\n    this.addStartToken = args.addStartToken ?? true;\n    this.addEndToken = args.addEndToken ?? 
true;\n\n    const gpt2Tokenizer = this.tokenizer as GPT2Tokenizer;\n    this.packer = new StartEndPacker({\n      startValue: gpt2Tokenizer.startTokenId,\n      endValue: gpt2Tokenizer.endTokenId,\n      padValue: gpt2Tokenizer.padTokenId,\n      sequenceLength: this.sequenceLength,\n    });\n  }\n\n  override getConfig(): serialization.ConfigDict {\n    const config = {\n      sequenceLength: this.sequenceLength,\n      addStartToken: this.addStartToken,\n      addEndToken: this.addEndToken,\n    };\n    const baseConfig = super.getConfig();\n    Object.assign(config, baseConfig);\n    return config;\n  }\n\n  override call(\n    inputs: Tensor|Tensor[], kwargs: GPT2PreprocessorOptions): Tensor|Tensor[] {\n    return this.callAndReturnPaddingMask(inputs, kwargs).tokenIds;\n  }\n\n  private callAndReturnPaddingMask(\n    inputs: Tensor|Tensor[],\n    kwargs: GPT2PreprocessorOptions\n  ): NamedTensorMap {\n    return tidy(() => {\n      if (inputs instanceof Array) {\n        if (inputs.length !== 1) {\n          throw new ValueError(\n            'GPT2 requires each input feature to contain only ' +\n            `one segment, but received ${inputs.length}. If you are using ` +\n            'GPT2 for a multi-segment classification task, please refer to ' +\n            'classification models like BERT or RoBERTa.'\n          );\n        }\n        inputs = inputs[0];\n      }\n\n      const sequenceLength = kwargs.sequenceLength ?? 
this.sequenceLength;\n      const [tokenIds, paddingMask] = this.packer.callAndReturnPaddingMask(\n        this.tokenizer.call(inputs),\n        {\n          sequenceLength,\n          addStartValue: this.addStartToken,\n          addEndValue: this.addEndToken\n        }\n      );\n\n      return {\n        tokenIds: tokenIds as Tensor2D,\n        paddingMask: paddingMask as Tensor2D\n      };\n    });\n  }\n\n  /**\n   * Calls the layer and returns extra information like the paddingMask used to\n   * pack the sequence, the label data, and the sample weights used.\n   */\n  callAndPackArgs(inputs: Tensor|Tensor[], kwargs: GPT2PreprocessorOptions):\n    NamedTensorMap\n    | [NamedTensorMap, Tensor]\n    | [NamedTensorMap, Tensor, Tensor] {\n    const x = this.callAndReturnPaddingMask(inputs, kwargs);\n    return packXYSampleWeight(x, kwargs.y, kwargs.sampleWeight);\n  }\n\n  static override tokenizerCls<T extends serialization.Serializable>(\n    cls: serialization.SerializableConstructor<T>) {\n    return GPT2Tokenizer;\n  }\n}\nserialization.registerClass(GPT2Preprocessor);\n"]}