From 16d109ee80c991bb585e608151330edc710c2a47 Mon Sep 17 00:00:00 2001 From: dmiro Date: Tue, 27 Jan 2015 15:15:08 +0100 Subject: [PATCH 1/3] decode error stop_words = [line.strip().decode('utf-8') for line in language_file.readlines()] strip() returns a copy of the string with leading and trailing whitespace characters removed. But when it is called on a raw byte string that contains non-ASCII characters, it can remove a byte that belongs to a multi-byte UTF-8 sequence, so the subsequent decode() raises a UnicodeDecodeError (e.g. UnicodeDecodeError: 'utf8' codec can't decode byte 0xc3 in position 34: unexpected end of data). The fix is to decode first and strip afterwards: stop_words = [line.decode('utf-8').strip() for line in language_file.readlines()] --- stop_words/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stop_words/__init__.py b/stop_words/__init__.py index 42a2ce2..b351b0f 100644 --- a/stop_words/__init__.py +++ b/stop_words/__init__.py @@ -58,7 +58,7 @@ def get_stop_words(language): try: language_filename = '{0}{1}.txt'.format(STOP_WORDS_DIR, language) with open(language_filename, 'rb') as language_file: - stop_words = [line.strip().decode('utf-8') + stop_words = [line.decode('utf-8').strip() for line in language_file.readlines()] except IOError: raise StopWordError( From ff4f07d0c1ffbf2368e6e4e7fb0f1fc1b74b23fd Mon Sep 17 00:00:00 2001 From: dmiro Date: Tue, 27 Jan 2015 15:17:54 +0100 Subject: [PATCH 2/3] add catalan language to LANGUAGE_MAPPING --- stop_words/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stop_words/__init__.py b/stop_words/__init__.py index b351b0f..40a45de 100644 --- a/stop_words/__init__.py +++ b/stop_words/__init__.py @@ -7,6 +7,7 @@ LANGUAGE_MAPPING = { 'ar': 'arabic', + 'ca': 'catalan', 'da': 'danish', 'nl': 'dutch', 'en': 'english', From e5add47f7de32a6329d9af20ac52515f43eb9178 Mon Sep 17 00:00:00 2001 From: dmiro Date: Tue, 27 Jan 2015 15:19:26 +0100 Subject: [PATCH 3/3] Modified readme --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 63d1ad0..1e02813 
100644 --- a/README.rst +++ b/README.rst @@ -51,6 +51,7 @@ Available languages ------------------- * Arabic +* Catalan * Danish * Dutch * English