@tensorflow/tfjs-layers
Version:
TensorFlow layers API in JavaScript
142 lines • 18.8 kB
JavaScript
/**
* @license
* Copyright 2023 Google LLC.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================================
*/
/**
* GPT-2 preprocessor layer.
*/
/* Original source: keras-nlp/models/gpt2/gpt2_preprocessor.py */
import { serialization, tidy } from '@tensorflow/tfjs-core';
import { Preprocessor } from '../preprocessor';
import { GPT2Tokenizer } from './gpt2_tokenizer';
import { StartEndPacker } from '../../preprocessing/start_end_packer';
import { ValueError } from '../../../../errors';
/**
 * Bundles preprocessed features with optional label and sample-weight data.
 *
 * @param x The preprocessed features.
 * @param y Optional label data, passed through unaltered.
 * @param sampleWeight Optional sample weights, passed through unaltered.
 * @returns `x` alone when `y` is `undefined`; `[x, y]` when only
 *     `sampleWeight` is `undefined`; otherwise `[x, y, sampleWeight]`.
 */
export function packXYSampleWeight(x, y, sampleWeight) {
  // Without labels there is nothing to pack; any sample weights are ignored.
  if (y === undefined) {
    return x;
  }
  const packed = [x, y];
  if (sampleWeight !== undefined) {
    packed.push(sampleWeight);
  }
  return packed;
}
/**
* GPT2 preprocessing layer which tokenizes and packs inputs.
*
* This preprocessing layer will do 2 things:
*
* - Tokenize the inputs using the `tokenizer`.
* - Construct a dictionary with keys `"tokenIds"`, `"paddingMask"`, that can
* be passed directly to a `GPT2Backbone`.
*
* The call method of this layer accepts three arguments, `x`, `y`, and
* `sampleWeight`. `x` can be a string or tensor representing a single
* segment, a list of strings representing a batch of single segments,
* or a list of tensors representing multiple segments to be packed together.
* `y` and `sampleWeight` are both optional, can have any format, and will be
* passed through unaltered.
*
* `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is
* mainly used for generation tasks. For tasks having multi-segment inputs
* like "glue/mnli", please use a model designed for classification purposes
* such as BERT or RoBERTa.
*
* Examples:
*
* Directly calling the layer on data.
* ```js
* const features = ['a quick fox.', 'a fox quick.'];
* const vocabulary =
* new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);
* const merges =
* ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];
* const tokenizer = new GPT2Tokenizer({vocabulary, merges});
*
* const preprocessor = new GPT2Preprocessor({tokenizer});
* preprocessor.call(tensor(['the quick brown fox jumped.']), {}).print();
* ```
*/
class GPT2Preprocessor extends Preprocessor {
  /**
   * @param args Layer arguments. Reads `tokenizer`, plus the optional
   *     `sequenceLength` (defaults to 1024), `addStartToken` (defaults to
   *     `true`) and `addEndToken` (defaults to `true`).
   */
  constructor(args) {
    super(args);
    this.tokenizer = args.tokenizer;
    // `!= null` only rejects `null`/`undefined`, so explicit falsy values such
    // as `0` or `false` are kept instead of being replaced by the default.
    this.sequenceLength =
        args.sequenceLength != null ? args.sequenceLength : 1024;
    this.addStartToken =
        args.addStartToken != null ? args.addStartToken : true;
    this.addEndToken = args.addEndToken != null ? args.addEndToken : true;
    // The packer takes its start/end/pad values from the tokenizer.
    const gpt2Tokenizer = this.tokenizer;
    this.packer = new StartEndPacker({
      startValue: gpt2Tokenizer.startTokenId,
      endValue: gpt2Tokenizer.endTokenId,
      padValue: gpt2Tokenizer.padTokenId,
      sequenceLength: this.sequenceLength,
    });
  }

  /**
   * Returns the layer config: `sequenceLength`, `addStartToken` and
   * `addEndToken`, merged with the parent class config.
   */
  getConfig() {
    const config = {
      sequenceLength: this.sequenceLength,
      addStartToken: this.addStartToken,
      addEndToken: this.addEndToken,
    };
    const baseConfig = super.getConfig();
    // Keys present in the base config take precedence over the ones above.
    Object.assign(config, baseConfig);
    return config;
  }

  /**
   * Tokenizes and packs `inputs`, returning only the packed token ids.
   *
   * @param inputs A single input, or an array holding exactly one input.
   * @param kwargs Optional call options; `sequenceLength` overrides the
   *     configured length. May be omitted.
   * @returns The `tokenIds` entry produced by `callAndReturnPaddingMask`.
   * @throws ValueError if `inputs` is an array whose length is not 1.
   */
  call(inputs, kwargs) {
    return this.callAndReturnPaddingMask(inputs, kwargs).tokenIds;
  }

  /**
   * Tokenizes and packs `inputs` inside a `tidy` scope.
   *
   * @param inputs A single input, or an array holding exactly one input.
   * @param kwargs Optional call options; `sequenceLength` overrides the
   *     configured length. May be omitted.
   * @returns An object with the keys `tokenIds` and `paddingMask`.
   * @throws ValueError if `inputs` is an array whose length is not 1.
   */
  callAndReturnPaddingMask(inputs, kwargs) {
    // Callers that invoke `call()` directly may pass no kwargs at all; treat
    // that as "no overrides" instead of throwing on the property access below.
    const options = kwargs == null ? {} : kwargs;
    return tidy(() => {
      let segment = inputs;
      if (inputs instanceof Array) {
        if (inputs.length !== 1) {
          throw new ValueError('GPT2 requires each input feature to contain only ' +
              `one segment, but received ${inputs.length}. If you are using ` +
              'GPT2 for a multi-segment classification task, please refer to ' +
              'classification models like BERT or RoBERTa.');
        }
        segment = inputs[0];
      }
      // A per-call sequence length wins over the one fixed at construction.
      const sequenceLength = options.sequenceLength != null ?
          options.sequenceLength :
          this.sequenceLength;
      const [tokenIds, paddingMask] = this.packer.callAndReturnPaddingMask(
          this.tokenizer.call(segment), {
            sequenceLength,
            addStartValue: this.addStartToken,
            addEndValue: this.addEndToken
          });
      return {
        tokenIds: tokenIds,
        paddingMask: paddingMask
      };
    });
  }

  /**
   * Calls the layer and returns extra information like the paddingMask used to
   * pack the sequence, the label data, and the sample weights used.
   *
   * @param inputs A single input, or an array holding exactly one input.
   * @param kwargs Optional call options. `y` and `sampleWeight` are passed
   *     through unaltered; `sequenceLength` overrides the configured length.
   * @returns The `{tokenIds, paddingMask}` object alone when `kwargs.y` is
   *     `undefined`, otherwise an array of that object, `y` and (when it is
   *     not `undefined`) `sampleWeight`.
   * @throws ValueError if `inputs` is an array whose length is not 1.
   */
  callAndPackArgs(inputs, kwargs) {
    // Same normalisation as above so that `y`/`sampleWeight` can be read safely.
    const options = kwargs == null ? {} : kwargs;
    const x = this.callAndReturnPaddingMask(inputs, options);
    return packXYSampleWeight(x, options.y, options.sampleWeight);
  }

  /**
   * Returns the tokenizer class paired with this preprocessor. The `cls`
   * argument is accepted but not used.
   */
  static tokenizerCls(cls) {
    return GPT2Tokenizer;
  }
}
// Compiled form of the `static className` class field: the name is attached
// to the constructor after the class body. Presumably read by the
// `registerClass` call below, so keep this assignment first — TODO confirm.
/** @nocollapse */
GPT2Preprocessor.className = 'GPT2Preprocessor';
export { GPT2Preprocessor };
// Hand the class to tfjs-core's `serialization.registerClass`.
serialization.registerClass(GPT2Preprocessor);
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"gpt2_preprocessor.js","sourceRoot":"","sources":["../../../../../../../../../tfjs-layers/src/layers/nlp/models/gpt2/gpt2_preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH;;GAEG;AAEH,iEAAiE;AACjE,OAAO,EAAoC,aAAa,EAAE,IAAI,EAAE,MAAM,uBAAuB,CAAC;AAG9F,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AACtE,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AA8ChD,MAAM,UAAU,kBAAkB,CAChC,CAAiB,EAAE,CAAU,EAAE,YAAqB;IAKpD,IAAI,CAAC,KAAK,SAAS,EAAE;QACnB,OAAO,CAAC,CAAC;KACV;SAAM,IAAI,YAAY,KAAK,SAAS,EAAE;QACrC,OAAO,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;KACf;SAAM;QACL,OAAO,CAAC,CAAC,EAAE,CAAC,EAAE,YAAY,CAAC,CAAC;KAC7B;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AACH,MAAa,gBAAiB,SAAQ,YAAY;IAShD,YAAY,IAA0B;;QACpC,KAAK,CAAC,IAAI,CAAC,CAAC;QACZ,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC;QAChC,IAAI,CAAC,cAAc,GAAG,MAAA,IAAI,CAAC,cAAc,mCAAI,IAAI,CAAC;QAClD,IAAI,CAAC,aAAa,GAAG,MAAA,IAAI,CAAC,aAAa,mCAAI,IAAI,CAAC;QAChD,IAAI,CAAC,WAAW,GAAG,MAAA,IAAI,CAAC,WAAW,mCAAI,IAAI,CAAC;QAE5C,MAAM,aAAa,GAAG,IAAI,CAAC,SAA0B,CAAC;QACtD,IAAI,CAAC,MAAM,GAAG,IAAI,cAAc,CAAC;YAC/B,UAAU,EAAE,aAAa,CAAC,YAAY;YACtC,QAAQ,EAAE,aAAa,CAAC,UAAU;YAClC,QAAQ,EAAE,aAAa,CAAC,UAAU;YAClC,cAAc,EAAE,IAAI,CAAC,cAAc;SACpC,CAAC,CAAC;IACL,CAAC;IAEQ,SAAS;QAChB,MAAM,MAAM,GAAG;YACb,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,aAAa,EAAE,IAAI,CAAC,aAAa;YACjC,WAAW,EAAE,IAAI,CAAC,WAAW;SAC9B,CAAC;QACF,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IAEQ,IAAI,CACX,MAAuB,EAAE,MAA+B;QACxD,OAAO,IAAI,CAAC,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC;IAChE,CAAC;IAEO,wBAAwB,CAC9B,MAAuB,EACvB,MAA+B;QAE/B,OAAO,IAAI,CAAC,GAAG,EAAE;;YACf,IAAI,MAAM,YAAY,KAAK,EAAE;gBAC3B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE;oBACvB,MAAM,IAAI,UAAU,CAClB,mDAAmD;wBACnD,6BAA6B,MAAM,CAAC,MAAM,qBAAqB;wBAC/D,gEAAgE;wBAChE,6CAA6C,CAC9C,CAAC;iBACH;gBACD,MAAM,GAAG,MAA
M,CAAC,CAAC,CAAC,CAAC;aACpB;YAED,MAAM,cAAc,GAAG,MAAA,MAAM,CAAC,cAAc,mCAAI,IAAI,CAAC,cAAc,CAAC;YACpE,MAAM,CAAC,QAAQ,EAAE,WAAW,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,wBAAwB,CAClE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,EAC3B;gBACE,cAAc;gBACd,aAAa,EAAE,IAAI,CAAC,aAAa;gBACjC,WAAW,EAAE,IAAI,CAAC,WAAW;aAC9B,CACF,CAAC;YAEF,OAAO;gBACL,QAAQ,EAAE,QAAoB;gBAC9B,WAAW,EAAE,WAAuB;aACrC,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACH,eAAe,CAAC,MAAuB,EAAE,MAA+B;QAItE,MAAM,CAAC,GAAG,IAAI,CAAC,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxD,OAAO,kBAAkB,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,EAAE,MAAM,CAAC,YAAY,CAAC,CAAC;IAC9D,CAAC;IAED,MAAM,CAAU,YAAY,CAC1B,GAA6C;QAC7C,OAAO,aAAa,CAAC;IACvB,CAAC;;AAzFD,kBAAkB;AACF,0BAAS,GAAG,kBAAkB,CAAC;SAFpC,gBAAgB;AA4F7B,aAAa,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC","sourcesContent":["/**\n * @license\n * Copyright 2023 Google LLC.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n * =============================================================================\n */\n\n/**\n * GPT-2 preprocessor layer.\n */\n\n/* Original source: keras-nlp/models/gpt2/gpt2_preprocessor.py */\nimport { NamedTensorMap, Tensor, Tensor2D, serialization, tidy } from '@tensorflow/tfjs-core';\n\nimport { LayerArgs } from '../../../../engine/topology';\nimport { Preprocessor } from '../preprocessor';\nimport { GPT2Tokenizer } from './gpt2_tokenizer';\nimport { StartEndPacker } from '../../preprocessing/start_end_packer';\nimport { ValueError } from '../../../../errors';\n\nexport declare 
interface GPT2PreprocessorArgs extends LayerArgs {\n  /**\n   * A GPT2Tokenizer instance.\n   */\n  tokenizer: GPT2Tokenizer;\n\n  /**\n   * The length of the packed inputs.\n   * Defaults to 1024.\n   */\n  sequenceLength?: number;\n\n  /**\n   * If `true`, the preprocessor will prepend the tokenizer start token to each\n   * input sequence.\n   * Defaults to `true`.\n   */\n  addStartToken?: boolean;\n\n  /**\n   * If `true`, the preprocessor will prepend the tokenizer end token to each\n   * input sequence.\n   * Defaults to `true`.\n   */\n  addEndToken?: boolean;\n}\n\nexport declare interface GPT2PreprocessorOptions {\n  /**\n   * Any label data. Will be passed through unaltered.\n   */\n  y?: Tensor;\n\n  /**\n   * Any label weight data. Will be passed through unaltered.\n   */\n  sampleWeight?: Tensor;\n\n  /**\n   * Pass to override the configured `sequenceLength` of the layer.\n   */\n  sequenceLength?: number;\n}\n\nexport function packXYSampleWeight(\n  x: NamedTensorMap, y?: Tensor, sampleWeight?: Tensor):\n  NamedTensorMap\n  | [NamedTensorMap, Tensor]\n  | [NamedTensorMap, Tensor, Tensor] {\n\n  if (y === undefined) {\n    return x;\n  } else if (sampleWeight === undefined) {\n    return [x, y];\n  } else {\n    return [x, y, sampleWeight];\n  }\n}\n\n/**\n * GPT2 preprocessing layer which tokenizes and packs inputs.\n *\n * This preprocessing layer will do 2 things:\n *\n * - Tokenize the inputs using the `tokenizer`.\n * - Construct a dictionary with keys `\"tokenIds\"`, `\"paddingMask\"`, that can\n *     be passed directly to a `GPT2Backbone`.\n *\n * The call method of this layer accepts three arguments, `x`, `y`, and\n * `sampleWeight`. 
`x` can be a string or tensor representing a single\n * segment, a list of strings representing a batch of single segments,\n * or a list of tensors representing multiple segments to be packed together.\n * `y` and `sampleWeight` are both optional, can have any format, and will be\n * passed through unaltered.\n *\n * `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is\n * mainly used for generation tasks. For tasks having multi-segment inputs\n * like \"glue/mnli\", please use a model designed for classification purposes\n * such as BERT or RoBERTa.\n *\n * Examples:\n *\n * Directly calling the layer on data.\n * ```js\n * const features =  ['a quick fox.', 'a fox quick.'];\n * const vocabulary =\n *    new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);\n * const merges =\n *    ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];\n * const tokenizer = GPT2Tokenizer({vocabulary, merges});\n *\n * const preprocessor = GPT2Preprocessor({tokenizer});\n * preprocessor.call(tensor(['the quick brown fox jumped.']))[0].print();\n * ```\n */\nexport class GPT2Preprocessor extends Preprocessor {\n  /** @nocollapse */\n  static override className = 'GPT2Preprocessor';\n\n  protected readonly sequenceLength: number;\n  protected readonly addStartToken: boolean;\n  protected readonly addEndToken: boolean;\n  protected readonly packer: StartEndPacker;\n\n  constructor(args: GPT2PreprocessorArgs) {\n    super(args);\n    this.tokenizer = args.tokenizer;\n    this.sequenceLength = args.sequenceLength ?? 1024;\n    this.addStartToken = args.addStartToken ?? true;\n    this.addEndToken = args.addEndToken ?? 
true;\n\n    const gpt2Tokenizer = this.tokenizer as GPT2Tokenizer;\n    this.packer = new StartEndPacker({\n      startValue: gpt2Tokenizer.startTokenId,\n      endValue: gpt2Tokenizer.endTokenId,\n      padValue: gpt2Tokenizer.padTokenId,\n      sequenceLength: this.sequenceLength,\n    });\n  }\n\n  override getConfig(): serialization.ConfigDict {\n    const config = {\n      sequenceLength: this.sequenceLength,\n      addStartToken: this.addStartToken,\n      addEndToken: this.addEndToken,\n    };\n    const baseConfig = super.getConfig();\n    Object.assign(config, baseConfig);\n    return config;\n  }\n\n  override call(\n    inputs: Tensor|Tensor[], kwargs: GPT2PreprocessorOptions): Tensor|Tensor[] {\n    return this.callAndReturnPaddingMask(inputs, kwargs).tokenIds;\n  }\n\n  private callAndReturnPaddingMask(\n    inputs: Tensor|Tensor[],\n    kwargs: GPT2PreprocessorOptions\n  ): NamedTensorMap {\n    return tidy(() => {\n      if (inputs instanceof Array) {\n        if (inputs.length !== 1) {\n          throw new ValueError(\n            'GPT2 requires each input feature to contain only ' +\n            `one segment, but received ${inputs.length}. If you are using ` +\n            'GPT2 for a multi-segment classification task, please refer to ' +\n            'classification models like BERT or RoBERTa.'\n          );\n        }\n        inputs = inputs[0];\n      }\n\n      const sequenceLength = kwargs.sequenceLength ?? 
this.sequenceLength;\n      const [tokenIds, paddingMask] = this.packer.callAndReturnPaddingMask(\n        this.tokenizer.call(inputs),\n        {\n          sequenceLength,\n          addStartValue: this.addStartToken,\n          addEndValue: this.addEndToken\n        }\n      );\n\n      return {\n        tokenIds: tokenIds as Tensor2D,\n        paddingMask: paddingMask as Tensor2D\n      };\n    });\n  }\n\n  /**\n   * Calls the layer and returns extra information like the paddingMask used to\n   * pack the sequence, the label data, and the sample weights used.\n   */\n  callAndPackArgs(inputs: Tensor|Tensor[], kwargs: GPT2PreprocessorOptions):\n    NamedTensorMap\n    | [NamedTensorMap, Tensor]\n    | [NamedTensorMap, Tensor, Tensor] {\n    const x = this.callAndReturnPaddingMask(inputs, kwargs);\n    return packXYSampleWeight(x, kwargs.y, kwargs.sampleWeight);\n  }\n\n  static override tokenizerCls<T extends serialization.Serializable>(\n    cls: serialization.SerializableConstructor<T>) {\n    return GPT2Tokenizer;\n  }\n}\nserialization.registerClass(GPT2Preprocessor);\n"]}