# Source code for pyams_catalog.nltk

#
# Copyright (c) 2008-2015 Thierry Florac <tflorac AT ulthar.net>
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#

__docformat__ = 'restructuredtext'


# import standard library

# import interfaces
from hypatia.text.interfaces import IPipelineElement

# import packages
import nltk
from pyams_i18n.language import BASE_LANGUAGES
from pyams_utils.unicode import translate_string
from zope.interface import implementer


@implementer(IPipelineElement)
class NltkStemmedTextProcessor(object):
    """NLTK based text processor using a Snowball stemmer.

    Hypatia pipeline element which tokenizes input strings, removes
    stopwords and reduces each remaining token to its stem.

    :param str language: language name (e.g. ``'english'``) or a key of
        :data:`BASE_LANGUAGES` (e.g. ``'en'``), which is translated to the
        lowercase language name expected by NLTK.
    """

    def __init__(self, language='english'):
        # Accept ISO-style codes by mapping them through BASE_LANGUAGES
        # to the full lowercase language name NLTK expects.
        if language in BASE_LANGUAGES:
            language = BASE_LANGUAGES[language].lower()
        self.language = language
        self.stemmer = nltk.stem.SnowballStemmer(language, ignore_stopwords=True)

    def _process(self, lst, keep_chars):
        """Tokenize, stem and filter every string in *lst*.

        *keep_chars* is forwarded to :func:`translate_string`; apostrophes
        are then replaced by spaces so that elided forms split into
        separate tokens.  Stopwords are filtered both before and after
        stemming, and stems shorter than two characters are dropped.
        """
        stemmer = self.stemmer
        stopwords = stemmer.stopwords  # hoist attribute lookups out of the loops
        result = []
        for text in lst:
            translated = translate_string(text, keep_chars=keep_chars).replace("'", ' ')
            tokens = nltk.word_tokenize(translated, self.language)
            result += [stem
                       for stem in (stemmer.stem(token)
                                    for token in tokens
                                    if token not in stopwords)
                       if stem and (len(stem) > 1) and (stem not in stopwords)]
        return result

    def process(self, lst):
        """Return the list of stems extracted from the strings in *lst*."""
        return self._process(lst, "'-")

    def processGlob(self, lst):
        """Same as :meth:`process` but keeps glob wildcards ``*`` and ``?``."""
        return self._process(lst, "'-*?")
@implementer(IPipelineElement)
class NltkFullTextProcessor(object):
    """NLTK based full text processor.

    Hypatia pipeline element which tokenizes input strings without any
    stemming or stopword removal; only tokens longer than one character
    are kept.

    :param str language: language name (e.g. ``'english'``) or a key of
        :data:`BASE_LANGUAGES` (e.g. ``'en'``), which is translated to the
        lowercase language name expected by NLTK.
    """

    def __init__(self, language='english'):
        # Accept ISO-style codes by mapping them through BASE_LANGUAGES
        # to the full lowercase language name NLTK expects.
        if language in BASE_LANGUAGES:
            language = BASE_LANGUAGES[language].lower()
        self.language = language

    def _process(self, lst, keep_chars):
        """Tokenize every string in *lst*, keeping tokens longer than one char.

        *keep_chars* is forwarded to :func:`translate_string`; apostrophes
        are then replaced by spaces so that elided forms split into
        separate tokens.
        """
        result = []
        for text in lst:
            translated = translate_string(text, keep_chars=keep_chars).replace("'", ' ')
            result += [token
                       for token in nltk.word_tokenize(translated, self.language)
                       if token and len(token) > 1]
        return result

    def process(self, lst):
        """Return the list of tokens extracted from the strings in *lst*."""
        return self._process(lst, "'-")

    def processGlob(self, lst):
        """Same as :meth:`process` but keeps glob wildcards ``*`` and ``?``."""
        return self._process(lst, "'-*?")