You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hello, I have an issue: when I try to run `import dadmatools.pipeline.language as language` on my local machine, I get this error: `UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 220: character maps to <undefined>`
How can I fix this?
This is the full trace of the error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Cell In[34], line 1
----> 1 import dadmatools.pipeline.language as language
3 # here lemmatizer and pos tagger will be loaded
4 # as tokenizer is the default tool, it will be loaded as well even without calling
5 pips = 'lem'
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\__init__.py:1
----> 1 from .language import Pipeline
2 from .tpipeline import TPipeline
3 from .language import supported_langs, langwithner, remove_with_path
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\language.py:4
1 from typing import List
3 from .config import config as master_config
----> 4 from .informal2formal.main import Informal2Formal
5 from .models.base_models import Multilingual_Embedding
6 from .models.classifiers import TokenizerClassifier, PosDepClassifier, NERClassifier, SentenceClassifier, \
7 KasrehClassifier
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\informal2formal\main.py:6
4 import yaml
5 from .download_utils import download_dataset
----> 6 import dadmatools.pipeline.informal2formal.utils as utils
7 from .formality_transformer import FormalityTransformer
8 from dadmatools.pipeline.persian_tokenization.tokenizer import SentenceTokenizer
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\pipeline\informal2formal\utils.py:10
7 from dadmatools.pipeline.persian_tokenization.tokenizer import WordTokenizer
8 from dadmatools.normalizer import Normalizer
---> 10 normalizer = Normalizer()
11 tokenizer = WordTokenizer('cache/dadmatools')
12 # tokenizer = WordTokenizer(separate_emoji=True)
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\dadmatools\normalizer.py:32, in Normalizer.__init__(self, full_cleaning, unify_chars, refine_punc_spacing, remove_extra_space, remove_puncs, remove_html, remove_stop_word, replace_email_with, replace_number_with, replace_url_with, replace_mobile_number_with, replace_emoji_with, replace_home_number_with)
30 self.remove_puncs = remove_puncs
31 self.remove_stop_word = remove_stop_word
---> 32 self.STOPWORDS = open(prefix+save_dir+'stopwords-fa.py').read().splitlines()
33 self.PUNCS = string.punctuation.replace('<', '').replace('>', '') + '،؟'
34 if full_cleaning:
File c:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py:23, in IncrementalDecoder.decode(self, input, final)
22 def decode(self, input, final=False):
---> 23 return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 220: character maps to <undefined>
The text was updated successfully, but these errors were encountered:
Hello, I have an issue: when I try to run `import dadmatools.pipeline.language as language` on my local machine, I get this error:
`UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 220: character maps to <undefined>`
How can I fix this?
This is the full trace of the error:
The text was updated successfully, but these errors were encountered: