# Source code for pyams_catalog.nltk

#
# Copyright (c) 2008-2015 Thierry Florac <tflorac AT ulthar.net>
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#

__docformat__ = 'restructuredtext'


# import standard library

# import interfaces
from hypatia.text.interfaces import IPipelineElement

# import packages
import nltk
from pyams_i18n.language import BASE_LANGUAGES
from pyams_utils.unicode import translate_string
from zope.interface import implementer


@implementer(IPipelineElement)
class NltkStemmedTextProcessor(object):
    """NLTK based text processor using a Snowball stemmer.

    Hypatia pipeline element which tokenizes input strings, removes
    stopwords and reduces each remaining token to its stem.

    :param str language: language name (e.g. ``'english'``) or a key of
        :data:`BASE_LANGUAGES` (e.g. ``'en'``), which is translated to the
        lowercase language name expected by NLTK.
    """

    def __init__(self, language='english'):
        # Accept ISO-style codes by mapping them through BASE_LANGUAGES
        # to the full lowercase language name NLTK expects.
        if language in BASE_LANGUAGES:
            language = BASE_LANGUAGES[language].lower()
        self.language = language
        self.stemmer = nltk.stem.SnowballStemmer(language, ignore_stopwords=True)

    def _process(self, lst, keep_chars):
        """Tokenize, stem and filter every string in *lst*.

        *keep_chars* is forwarded to :func:`translate_string`; apostrophes
        are then replaced by spaces so that elided forms split into
        separate tokens.  Stopwords are filtered both before and after
        stemming, and stems shorter than two characters are dropped.
        """
        stemmer = self.stemmer
        stopwords = stemmer.stopwords  # hoist attribute lookups out of the loops
        result = []
        for text in lst:
            translated = translate_string(text, keep_chars=keep_chars).replace("'", ' ')
            tokens = nltk.word_tokenize(translated, self.language)
            result += [stem
                       for stem in (stemmer.stem(token)
                                    for token in tokens
                                    if token not in stopwords)
                       if stem and (len(stem) > 1) and (stem not in stopwords)]
        return result

    def process(self, lst):
        """Return the list of stems extracted from the strings in *lst*."""
        return self._process(lst, "'-")

    def processGlob(self, lst):
        """Same as :meth:`process` but keeps glob wildcards ``*`` and ``?``."""
        return self._process(lst, "'-*?")
@implementer(IPipelineElement)
class NltkFullTextProcessor(object):
    """NLTK based full text processor.

    Hypatia pipeline element which tokenizes input strings without any
    stemming or stopword removal; only tokens longer than one character
    are kept.

    :param str language: language name (e.g. ``'english'``) or a key of
        :data:`BASE_LANGUAGES` (e.g. ``'en'``), which is translated to the
        lowercase language name expected by NLTK.
    """

    def __init__(self, language='english'):
        # Accept ISO-style codes by mapping them through BASE_LANGUAGES
        # to the full lowercase language name NLTK expects.
        if language in BASE_LANGUAGES:
            language = BASE_LANGUAGES[language].lower()
        self.language = language

    def _process(self, lst, keep_chars):
        """Tokenize every string in *lst*, keeping tokens longer than one char.

        *keep_chars* is forwarded to :func:`translate_string`; apostrophes
        are then replaced by spaces so that elided forms split into
        separate tokens.
        """
        result = []
        for text in lst:
            translated = translate_string(text, keep_chars=keep_chars).replace("'", ' ')
            result += [token
                       for token in nltk.word_tokenize(translated, self.language)
                       if token and len(token) > 1]
        return result

    def process(self, lst):
        """Return the list of tokens extracted from the strings in *lst*."""
        return self._process(lst, "'-")

    def processGlob(self, lst):
        """Same as :meth:`process` but keeps glob wildcards ``*`` and ``?``."""
        return self._process(lst, "'-*?")