Back to index

plone3  3.1.7
UnicodeSplitter.py
Go to the documentation of this file.
00001 ## Copyright (c) 2002, Infrae. All rights reserved.
00002 
00003 ## Redistribution and use in source and binary forms, with or without
00004 ## modification, are permitted provided that the following conditions are
00005 ## met:
00006 
00007 ##   1. Redistributions of source code must retain the above copyright
00008 ##      notice, this list of conditions and the following disclaimer.
00009 
00010 ##   2. Redistributions in binary form must reproduce the above copyright
00011 ##      notice, this list of conditions and the following disclaimer in
00012 ##      the documentation and/or other materials provided with the
00013 ##      distribution.
00014 
00015 ##   3. Neither the name of Infrae nor the names of its contributors may
00016 ##      be used to endorse or promote products derived from this software
00017 ##      without specific prior written permission.
00018 
00019 ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00020 ## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00021 ## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00022 ## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INFRAE OR
00023 ## CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00024 ## EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00025 ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00026 ## PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00027 ## LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00028 ## NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00029 ## SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 
00031 from utils import classImplements
00032 from Products.ZCTextIndex.ISplitter import ISplitter
00033 from Products.ZCTextIndex.PipelineFactory import element_factory
00034 
00035 import re
# Encoding used by the pipeline elements below, both to decode incoming
# byte strings to unicode and to encode the resulting words again.
enc = 'utf-8'
00037 
class Splitter:
    """Word splitter that prefers unicode-aware splitting.

    Each input string is decoded with the module-level encoding ``enc``
    (unless it already is a unicode object) and split with the
    ``re.UNICODE`` patterns; the words found are encoded back with
    ``enc``.  Strings that cannot be decoded are split with the
    ``re.LOCALE`` patterns instead and their words are returned as found.
    """

    __implements__ = ISplitter

    # Locale-aware patterns, used as the fallback for undecodable input.
    rx_L = re.compile(r"\w+", re.LOCALE)
    rxGlob_L = re.compile(r"\w+[\w*?]*", re.LOCALE)

    # Unicode-aware patterns, used for everything that decodes cleanly.
    rx_U = re.compile(r"\w+", re.UNICODE)
    rxGlob_U = re.compile(r"\w+[\w*?]*", re.UNICODE)

    def _split(self, lst, rx_unicode, rx_locale):
        """Split every string in ``lst`` and return one flat list of words.

        ``rx_unicode`` is applied to strings that are (or can be decoded
        to) unicode; ``rx_locale`` is applied to the original string when
        decoding fails.
        """
        result = []
        for s in lst:
            # This is a hack to get the word splitting working with
            # non-unicode text.
            try:
                if not isinstance(s, unicode):
                    s = unicode(s, enc)
            except (UnicodeDecodeError, TypeError):
                # Fall back to locale aware splitter; ``s`` is still the
                # original, undecoded value here.
                result.extend(rx_locale.findall(s))
            else:
                result.extend([w.encode(enc) for w in rx_unicode.findall(s)])
        return result

    def process(self, lst):
        """Return the words (runs of ``\\w`` characters) found in ``lst``."""
        return self._split(lst, self.rx_U, self.rx_L)

    def processGlob(self, lst):
        """Like ``process``, but keep ``*`` and ``?`` that follow the
        first word character, so glob patterns stay in one piece.
        """
        return self._split(lst, self.rxGlob_U, self.rxGlob_L)
00079 
# Declare Splitter's interface (taken from its __implements__ attribute)
# through the classImplements helper imported from utils.
classImplements(Splitter, Splitter.__implements__)

try:
    element_factory.registerFactory(
        'Word Splitter',
        'Unicode Whitespace splitter',
        Splitter)
except ValueError:
    # ValueError is raised when the splitter has already been registered;
    # that is harmless, so it is ignored.
    pass
00088 
class CaseNormalizer:
    """Pipeline element that lower-cases every term it is given."""

    def process(self, lst):
        """Return a list with the lower-cased form of each item in ``lst``.

        Items that are unicode, or that decode with the module-level
        encoding ``enc``, are lower-cased as unicode and encoded back with
        ``enc``.  Items that cannot be decoded are lower-cased as they
        are.
        """
        lowered = []
        for term in lst:
            # Hack so that the normalizer also copes with non-unicode text.
            try:
                if isinstance(term, unicode):
                    text = term
                else:
                    text = unicode(term, enc)
            except (UnicodeDecodeError, TypeError):
                # Not decodable: lower-case the original value directly.
                lowered.append(term.lower())
                continue
            lowered.append(text.lower().encode(enc))
        return lowered
00104 
try:
    element_factory.registerFactory(
        'Case Normalizer',
        'Unicode Case Normalizer',
        CaseNormalizer)
except ValueError:
    # ValueError is raised when the normalizer has already been
    # registered; that is harmless, so it is ignored.
    pass
00111