Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 14 additions & 17 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ pipeline {
environment {
AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0'
EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-25-0'
EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-27-26-0'
ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0'
ES_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-30-24-0'
HI_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-27-26-0'
FR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-07-25-0'
HU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-01-26-1'
Expand All @@ -27,8 +28,8 @@ pipeline {
HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-27-26-0'
KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-03-25-0'
DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down Expand Up @@ -144,21 +145,12 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=es_en --text="ciento uno " --cache_dir ${ES_EN_TN_CACHE}'
}
}
}
}

stage('L0: Create AR TN/ITN Grammars') {
when {
anyOf {
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
stage('L0: Codeswitched HI/EN ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi_en --text="एक" --cache_dir ${HI_EN_TN_CACHE}'
}
}
}
failFast true
parallel {
stage('L0: AR TN grammars') {
stage('L0: FR TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ar --text="2" --cache_dir ${AR_TN_CACHE}'
}
Expand Down Expand Up @@ -409,6 +401,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/es_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${ES_EN_TN_CACHE}'
}
}
stage('L1: Run all Codeswitched HI/EN TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hi_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${HI_EN_TN_CACHE}'
}
}
stage('L1: Run all AR TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ar/ -m "not pleasefixme" --cpu --tn_cache_dir ${AR_TN_CACHE}'
Expand Down
17 changes: 17 additions & 0 deletions nemo_text_processing/inverse_text_normalization/hi_en/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import VerbalizeFst
from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize_final import VerbalizeFinalFst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst
from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst as EnDateFst
from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst as EnDecimalFst
from nemo_text_processing.inverse_text_normalization.en.taggers.electronic import ElectronicFst as EnElectronicFst
from nemo_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst as EnMeasureFst
from nemo_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst as EnMoneyFst
from nemo_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst as EnOrdinalFst
from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst as EnPunctuationFst
from nemo_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst as EnTelephoneFst
from nemo_text_processing.inverse_text_normalization.en.taggers.time import TimeFst as EnTimeFst
from nemo_text_processing.inverse_text_normalization.en.taggers.whitelist import WhiteListFst as EnWhiteListFst
from nemo_text_processing.inverse_text_normalization.en.taggers.word import WordFst as EnWordFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.date import DateFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.decimal import DecimalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.fraction import FractionFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
from nemo_text_processing.text_normalization.en.graph_utils import (
INPUT_LOWER_CASED,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.utils.logging import logger


class ClassifyFst(GraphFst):
    """
    Final class that composes all Hindi and English classification grammars into a
    single code-switched (hi_en) inverse text normalization tagger. The resulting
    grammar processes an entire sentence.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None (or the string "None")
            to avoid using cache.
        overwrite_cache: set to True to rebuild the grammar and overwrite the .far file
        whitelist: path to a file with Hindi whitelist replacements. If None, defaults to the Hindi whitelist at
            nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv
        en_whitelist: path to a file with English whitelist replacements. If None, defaults to the English
            whitelist at nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
        input_case: accepting either "lower_cased" or "cased" input. It is forwarded to the English
            taggers and is part of the cache file name.
    """

    def __init__(
        self,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
        en_whitelist: str = None,
        input_case: str = INPUT_LOWER_CASED,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify")

        # Resolve the cache location; "None" as a string is treated like no cache
        # (NOTE(review): presumably to tolerate CLI-provided values -- confirm).
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, f"hi_en_itn_{input_case}.far")

        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Reuse the previously compiled grammar instead of rebuilding it.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logger.info("Creating ClassifyFst grammars.")

            # ---- Hindi taggers ----
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            fraction = FractionFst(cardinal)
            fraction_graph = fraction.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            date_graph = DateFst(cardinal, ordinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst(cardinal).fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
            whitelist_graph = WhiteListFst(input_file=whitelist).fst
            punct_graph = PunctuationFst().fst
            telephone_graph = TelephoneFst(cardinal).fst

            # ---- English taggers ----
            en_cardinal = EnCardinalFst(input_case=input_case)
            en_cardinal_graph = en_cardinal.fst

            en_ordinal = EnOrdinalFst(cardinal=en_cardinal, input_case=input_case)
            en_ordinal_graph = en_ordinal.fst

            en_decimal = EnDecimalFst(cardinal=en_cardinal, input_case=input_case)
            en_decimal_graph = en_decimal.fst

            en_measure_graph = EnMeasureFst(cardinal=en_cardinal, decimal=en_decimal, input_case=input_case).fst
            en_date_graph = EnDateFst(ordinal=en_ordinal, input_case=input_case).fst
            en_word_graph = EnWordFst().fst
            en_time_graph = EnTimeFst(input_case=input_case).fst
            en_money_graph = EnMoneyFst(cardinal=en_cardinal, decimal=en_decimal, input_case=input_case).fst
            en_whitelist_graph = EnWhiteListFst(input_file=en_whitelist, input_case=input_case).fst
            en_punct_graph = EnPunctuationFst().fst
            en_electronic_graph = EnElectronicFst(input_case=input_case).fst
            en_telephone_graph = EnTelephoneFst(cardinal=en_cardinal, input_case=input_case).fst

            # Union of every semiotic-class tagger. The weights rank competing analyses of the
            # same span; the plain word graphs carry very large weights so they act as fallbacks.
            # NOTE(review): presumably the lowest total weight wins when the shortest path is
            # taken downstream -- confirm against the normalizer that consumes this grammar.
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(en_whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(en_time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(en_date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.09)
                | pynutil.add_weight(en_decimal_graph, 1.09)
                | pynutil.add_weight(fraction_graph, 1.09)
                | pynutil.add_weight(measure_graph, 1.6)
                | pynutil.add_weight(en_measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.6)
                | pynutil.add_weight(en_cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.6)
                | pynutil.add_weight(en_ordinal_graph, 1.09)
                | pynutil.add_weight(money_graph, 1.6)
                | pynutil.add_weight(en_money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.6)
                | pynutil.add_weight(en_telephone_graph, 1.1)
                | pynutil.add_weight(en_electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
                | pynutil.add_weight(en_word_graph, 120)
            )

            # Wrap punctuation and classified spans in the `tokens { ... }` envelope.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            en_punct = (
                pynutil.insert("tokens { ") + pynutil.add_weight(en_punct_graph, weight=1.3) + pynutil.insert(" }")
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")

            # `+` binds tighter than `|`, so the union must be parenthesized; otherwise the
            # separating space is inserted only in front of the Hindi punctuation token and an
            # English punctuation token is emitted glued to the preceding token.
            # This changes the compiled grammar: an existing cached .far is only refreshed
            # when overwrite_cache=True.
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + token
                + pynini.closure(pynutil.insert(" ") + (punct | en_punct))
            )

            # One or more tokens separated by (collapsed) whitespace, with surrounding spaces removed.
            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                # Persist the freshly built grammar so later runs can restore it from cache.
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logger.info(f"ClassifyFst grammars are saved to {far_file}.")
27 changes: 27 additions & 0 deletions nemo_text_processing/inverse_text_normalization/hi_en/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os


def get_abs_path(rel_path):
    """
    Resolve a path given relative to this module's directory.

    Args:
        rel_path: path relative to the directory containing this file

    Returns the directory of this file joined to ``rel_path`` with a forward slash
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    # Plain "/" concatenation is kept on purpose: rel_path is appended verbatim.
    return "/".join((module_dir, rel_path))
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Loading
Loading