japanese-string-utils
Version:
Utilities that convert Japanese strings to other forms, such as hiragana, katakana, full-width, half-width, numeric, and others.
177 lines (135 loc) • 4.46 kB
text/typescript
// 三百三三億五万千五五十五 => 33300051555
import { toNumeric } from "to-numeric";
/** Shape of the normalization table: one source character mapped to its replacement string. */
interface normalizeMap {
	[ key: string ]: string;
}

/** Kanji numeral for one; shared by every "1" variant in the table below. */
const ONE = '一';

/**
 * Maps alternative spellings of numeric characters to the canonical forms the
 * parser works with: ASCII and full-width Arabic digits and formal (daiji)
 * numerals become basic kanji numerals, decimal-point variants become '.',
 * and sign variants become '-' or '+'.
 *
 * Keys must be single characters, because they are concatenated into a
 * regular-expression character class below. None of them may be a character
 * that is special inside a class (ASCII '-', ']', '\', or a leading '^').
 */
const normalizeMap: normalizeMap = {
	'０': '〇', '0': '〇', '零': '〇',
	'１': ONE, '1': ONE, '壱': ONE, '壹': ONE, '弌': ONE,
	'２': '二', '2': '二', '弐': '二', '貳': '二',
	'３': '三', '3': '三', '参': '三', '參': '三',
	'４': '四', '4': '四', '肆': '四',
	'５': '五', '5': '五', '伍': '五',
	'６': '六', '6': '六', '陸': '六',
	'７': '七', '7': '七', '漆': '七', '柒': '七', '質': '七',
	'８': '八', '8': '八', '捌': '八',
	'９': '九', '9': '九', '玖': '九',
	'拾': '十',
	// Single characters that stand for a whole multiple of ten.
	'廿': '二十',
	'卅': '三十',
	'丗': '三十',
	'卌': '四十',
	'佰': '百', '陌': '百',
	'仟': '千', '阡': '千',
	'萬': '万',
	'.': '.', '．': '.', '。': '.', '・': '.', // decimal point
	'ー': '-', '−': '-', '－': '-',
	'+': '+', '＋': '+',
};

/**
 * Global pattern matching any single character that has an entry in
 * `normalizeMap`. The keys are concatenated without a separator: inside
 * `[...]` a '|' is a literal member, not an alternation operator.
 */
const needsNormalizePattern = new RegExp( `[${ Object.keys( normalizeMap ).join( '' ) }]`, 'g' );
/** Table type: a kanji numeral mapped to the digit value it stands for. */
interface basicNumberMap {
	[ key: string ]: number;
}

/** Table type: a kanji unit character mapped to the power of ten it stands for. */
interface digitMap {
	[ key: string ]: number;
}

/** Kanji numerals for the digits zero through nine. */
const basicNumber: basicNumberMap = {
	'\u3007': 0, // 〇
	'\u4E00': 1, // 一
	'\u4E8C': 2, // 二
	'\u4E09': 3, // 三
	'\u56DB': 4, // 四
	'\u4E94': 5, // 五
	'\u516D': 6, // 六
	'\u4E03': 7, // 七
	'\u516B': 8, // 八
	'\u4E5D': 9, // 九
};

/** Units below ten thousand: ten, hundred, thousand. */
const basicDigit: digitMap = {
	'\u5341': 1e1, // 十
	'\u767E': 1e2, // 百
	'\u5343': 1e3, // 千
};

/**
 * Units that advance in steps of ten thousand.
 * 1e16 is already above Number.MAX_SAFE_INTEGER, so integer results at that
 * scale cannot all be represented exactly as a JavaScript number.
 */
const bigDigit: digitMap = {
	'\u4E07': 1e4, // 万
	'\u5104': 1e8, // 億
	'\u5146': 1e12, // 兆
	'\u4EAC': 1e16, // 京
};

// Single-character membership tests for the tables above. The keys are
// concatenated without a separator: inside `[...]` a '|' would be a literal
// member of the class, not an alternation operator.
const basicNumberPattern = new RegExp( `[${ Object.keys( basicNumber ).join( '' ) }]` );
const basicNumberWithDotPattern = new RegExp( `[${ [ ...Object.keys( basicNumber ), '.' ].join( '' ) }]` );
const basicDigitPattern = new RegExp( `[${ Object.keys( basicDigit ).join( '' ) }]` );
const bigDigitPattern = new RegExp( `[${ Object.keys( bigDigit ).join( '' ) }]` );
/**
 * Matches every character the parser cannot interpret: anything that is not
 * '.', a basic kanji numeral, or a kanji unit character. Built once at module
 * load. The characters are joined explicitly with '' so that no separator
 * (such as the ',' an implicit Array#toString would insert) ends up in the
 * keep-set.
 */
const unsupportedLetterPattern = new RegExp(
	`[^${ [
		'.',
		...Object.keys( basicNumber ),
		...Object.keys( basicDigit ),
		...Object.keys( bigDigit ),
	].join( '' ) }]`,
	'g'
);

/**
 * Converts a string containing kanji (or mixed kanji/Arabic) numerals into a
 * decimal numeric string.
 *
 * Steps:
 *  1. Trim the input and replace every character listed in `normalizeMap`
 *     with its canonical form.
 *  2. Remember a leading '+' or '-' as the sign.
 *  3. Drop every character that is not '.', a basic numeral, or a unit.
 *  4. Scan right to left, grouping numerals into chunks that each carry a
 *     multiplier (1, a unit such as 十/百/千, or a big unit such as 万/億).
 *  5. Sum `chunk value * multiplier` over all chunks.
 *
 * The sum uses JavaScript number arithmetic, so results above
 * Number.MAX_SAFE_INTEGER may lose precision.
 *
 * @param value - Text to convert.
 * @returns The sign (if any) followed by the computed number, or an empty
 *   string when the input contains no interpretable character.
 *
 * @example
 * toNumericFromKanji( '三百三三億五万千五五十五' ); // '33300051555'
 */
export function toNumericFromKanji( value: string ): string {

	// Normalize in a single pass. A matched character without a table entry
	// is left untouched instead of being replaced by the text "undefined".
	const normalized = value.trim().replace( needsNormalizePattern, ( char ) => {

		const replacement = normalizeMap[ char ];
		return replacement !== undefined ? replacement : char;

	} );

	// Capture the sign before the stripping step removes it.
	const signMatched = normalized.match( /^([+-])/ );
	const sign = signMatched ? signMatched[ 1 ] : '';

	// Remove everything the scan below cannot interpret.
	const letters = normalized.replace( unsupportedLetterPattern, '' );

	if ( letters === '' ) return '';

	type Chunk = { letters: string[], digit: number };

	// The first chunk collects the lowest-order numerals (multiplier 1).
	const chunks: Chunk[] = [ { letters: [], digit: 1 } ];

	// Multiplier of the most recently seen big unit (万, 億, ...); applies to
	// every smaller unit found further to the left.
	let currentBigDigit = 1;

	for ( let i = letters.length - 1; i >= 0; i -- ) {

		const char = letters[ i ];
		const currentChunk = chunks[ chunks.length - 1 ];

		// 〇 to 九 and the decimal point: prepend to the chunk being built.
		if ( basicNumberWithDotPattern.test( char ) ) {

			currentChunk.letters.unshift( char );
			continue;

		}

		// 千, 百, 十: open a chunk seeded with the numeral right before the
		// unit, or with an implied "one" when there is none.
		if ( basicDigitPattern.test( char ) ) {

			const previous = i > 0 ? letters[ i - 1 ] : '';
			const hasLeadNumber = previous !== '' && basicNumberPattern.test( previous );

			chunks.push( {
				letters: [ hasLeadNumber ? previous : ONE ],
				digit: basicDigit[ char ] * currentBigDigit,
			} );

			if ( hasLeadNumber ) i --; // two characters consumed, so skip one more
			continue;

		}

		// 万, 億 and other big units: open an empty chunk at that scale.
		if ( bigDigitPattern.test( char ) ) {

			currentBigDigit = bigDigit[ char ];
			chunks.push( { letters: [], digit: currentBigDigit } );

		}

	}

	const total = chunks.reduce( ( acc, chunk ) => {

		// Translate kanji numerals to Arabic digits; '.' passes through as is.
		// A chunk with no numerals contributes zero.
		const digits = chunk.letters.map( ( char ) => {

			return basicNumber[ char ] !== undefined ? basicNumber[ char ] : char;

		} ).join( '' ) || '0';

		// `toNumeric` is imported from "to-numeric"; its result is coerced to
		// a number with unary plus.
		const chunkValue = + toNumeric( digits );

		return acc + chunkValue * chunk.digit;

	}, 0 );

	return `${ sign }${ total }`;

}
// Unique list of strings attached to the converter: a comma, every key and
// every value of `normalizeMap`, and the keys of the three number tables.
// Values of `normalizeMap` include multi-character strings such as '二十'.
// NOTE(review): the comma is presumably there for digit grouping such as
// "1,000" — confirm against callers.
toNumericFromKanji.validLetters = Array.from( new Set(
	[ "," ]
		.concat( Object.entries( normalizeMap ).flat() )
		.concat(
			Object.keys( basicNumber ),
			Object.keys( basicDigit ),
			Object.keys( bigDigit ),
		)
) );