Back to index

moin  1.9.0~rc2
application_octet_stream.py
Go to the documentation of this file.
00001 # -*- coding: iso-8859-1 -*-
00002 """
00003     MoinMoin - binary file Filter
00004 
00005     Processes any binary file and extracts ASCII content from it.
00006 
00007     We ignore any file with a file extension on the blacklist, because
00008     we either can't handle it or it usually has no indexable content.
00009 
00010     Due to speed reasons, we only read the first maxread bytes from a file.
00011 
00012     For reducing the amount of trash, we only return words with
00013     length >= minwordlen.
00014 
00015     Depends on: nothing (pure python)
00016 
00017     @copyright: 2006 MoinMoin:ThomasWaldmann
00018     @license: GNU GPL, see COPYING for details.
00019 """
00020 
# upper bound on the number of bytes read from the start of each file
maxread = 10000

# words shorter than this many characters are thrown away
minwordlen = 4

# lowercase file extensions (including the dot) that are never processed
blacklist = (
    # CD/DVD images
    '.iso', '.nrg',
    # archives and compressed files
    '.zip', '.rar', '.lzh', '.lha',
    '.tar', '.gz', '.tgz', '.bz2', '.tb2', '.z',
    # windows
    '.exe', '.com', '.dll', '.cab', '.msi', '.bin',
    # linux
    '.rpm', '.deb',
    # mac
    '.hqx', '.dmg', '.sit',
    # java
    '.jar', '.class',
)
00032 
00033 import os, string
00034 
# the only characters that may be part of an indexable word.
# string.ascii_letters is used on purpose: string.letters is locale-dependent
# and could let non-ASCII bytes through, which execute() then fails to decode
# as ASCII.
_alnum = string.ascii_letters + string.digits


def _blank_unless_alnum(char):
    """Return char unchanged if it is an ASCII letter or digit, else a blank."""
    if char in _alnum:
        return char
    return ' '


# builds a list of all characters (chr(0) .. chr(255)), i.e. the identity
# translation table:
norm = ''.join([chr(i) for i in range(256)])

# builds a list of all non-alphanumeric characters:
non_alnum = ''.join([c for c in norm if c not in _alnum])

# translate table that replaces all non-alphanumeric by blanks:
trans_nontext = ''.join([_blank_unless_alnum(c) for c in norm])
00043 
def execute(indexobj, filename):
    """Extract indexable ASCII words from the beginning of a binary file.

    Files whose extension (compared case-insensitively) is on the blacklist
    are not opened at all. Otherwise at most maxread bytes are read, every
    byte that trans_nontext maps to a blank acts as a separator, and only
    the remaining words with at least minwordlen characters are kept.

    @param indexobj: not used by this filter
    @param filename: path of the file to process
    @rtype: unicode
    @return: the kept words joined by single blanks, or u'' for a
             blacklisted file
    """
    fileext = os.path.splitext(filename)[1]
    if fileext.lower() in blacklist:
        return u''
    f = open(filename, "rb")
    try:
        data = f.read(maxread)
    finally:
        # close the file even if reading fails, so the handle is not leaked
        f.close()
    data = data.translate(trans_nontext) # replace non-alphanumeric bytes by blanks
    data = data.split() # removes lots of blanks
    data = [s for s in data if len(s) >= minwordlen] # throw away too short stuff
    data = ' '.join(data)
    return data.decode('ascii')
00056