Back to index

indicator-appmenu  12.10.0
hudtoken.c
Go to the documentation of this file.
00001 /*
00002  * Copyright © 2012 Canonical Ltd.
00003  *
00004  * This program is free software: you can redistribute it and/or modify it
00005  * under the terms of the GNU General Public License version 3, as
00006  * published by the Free Software Foundation.
00007  *
00008  * This program is distributed in the hope that it will be useful, but
00009  * WITHOUT ANY WARRANTY; without even the implied warranties of
00010  * MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
00011  * PURPOSE.  See the GNU General Public License for more details.
00012  *
00013  * You should have received a copy of the GNU General Public License along
00014  * with this program.  If not, see <http://www.gnu.org/licenses/>.
00015  *
00016  * Author: Ryan Lortie <desrt@desrt.ca>
00017  */
00018 
00019 #include "hudtoken.h"
00020 
00021 #include "hudsettings.h"
00022 
00023 #include <string.h>
00024 
00025 struct _HudToken
00026 {
00027   guint     length;
00028   gchar    *original;
00029   gunichar *str;
00030 };
00031 
00032 /* This is actually one greater than the max-length.
00033  * Should be power-of-two for best performance.
00034  */
00035 #define TOKEN_LENGTH_LIMIT 32
00036 
00037 #if 0
00038 static void
00039 hud_token_distance_debug (const HudToken *haystack,
00040                           const HudToken *needle,
00041                           guint           d[TOKEN_LENGTH_LIMIT][TOKEN_LENGTH_LIMIT])
00042 {
00043   gint i, j;
00044 
00045   g_print (" ");
00046   for (j = 0; j <= haystack->length; j++)
00047     g_print ("%6lc", (wchar_t) (j ? haystack->str[j - 1] : ' '));
00048   g_print ("\n");
00049 
00050   for (i = 0; i <= needle->length; i++)
00051     {
00052       g_print ("%lc", (wchar_t) (i ? needle->str[i - 1] : ' '));
00053       for (j = 0; j <= haystack->length; j++)
00054         g_print ("%6u", d[i][j]);
00055       g_print ("\n");
00056     }
00057 }
00058 #endif
00059 
00060 #define MIN3(a,b,c) (MIN(MIN((a),(b)),(c)))
00061 
00062 /* Keeping in mind the fact that we're matching single words against
00063  * single words, we can expect the extremely vast majority of cases to
00064  * see needle and haystack both be less than 32 characters in length.
00065  *
00066  * For the common case we can avoid memory allocation by using a static
00067  * array.  By making the array a constant factor-of-two size we can
00068  * replace multiplication by a variable with bitshift by a constant.
00069  *
00070  * Tokens longer than this are expected to be so unlikely that we
00071  * simply deal with them by truncation during the normalisation phase.
00072  */
00073 guint
00074 hud_token_distance (const HudToken *haystack,
00075                     const HudToken *needle)
00076 {
00077   static guint d[TOKEN_LENGTH_LIMIT][TOKEN_LENGTH_LIMIT];
00078   gunichar h, n;
00079   gint result;
00080   gint i, j;
00081 
00082   g_assert (haystack->length < TOKEN_LENGTH_LIMIT && needle->length < TOKEN_LENGTH_LIMIT);
00083   g_assert (haystack->str[haystack->length] == 0);
00084   g_assert (needle->str[needle->length] == 0);
00085 
00086   /* This function only performs memory writes to 'd' and calls no other
00087    * functions.  No pointer to 'd' is ever leaked.  Hopefully the
00088    * compiler is clever enough to therefore realise that no other memory
00089    * will be modified during the running of this function and optimise
00090    * aggressively.
00091    */
00092 
00093   /* By convention we refer to "add" and "drop" in terms of "mistakes
00094    * the user made".  The user's token is the one in "needle".  To give
00095    * examples against the menu item "preferences", if the user typed:
00096    *
00097    *  - "prefereces": this has a dropped character
00098    *
00099    *  - "prefferences": this has an added character
00100    *
00101    *  - "prefefences": this has a swap
00102    *
00103    * We organise the matrix with each column (j) corresponding to a
00104    * character in the menu item text ('haystack') and each row (i)
00105    * corresponding to a character in the search token ('needle').
00106    *
00107    * We modify the Levenshtein algorithm in the following ways:
00108    *
00109    *  - configurable penalties for various mistakes (add, drop,
00110    *    swap).  This is done by replacing additions of '1' with
00111    *    additions of these configurable values.
00112    *
00113    *  - a lower penalties for drops that occur at the end of the token
00114    *    that the user typed.  For example, "prefer" would be given a
00115    *    lower penalty for those 5 missing letters than would occur if
00116    *    they were not missing from the end.
00117    *
00118    *    This is done by special-casing the last row (i == nlen) in the
00119    *    matrix, corresponding to the last character in the search
00120    *    token.  In this case, we calculate drops at a lower penalty.
00121    *
00122    * Implementing the first of these two changes is a simple tweak:
00123    * instead of adding '1', add the configurable value.
00124    *
00125    * Implementing the second one is somewhat more difficult: we could
00126    * modify the core algorithm, but that would violate the 'pureness' of
00127    * the dynamic programming approach.
00128    *
00129    * Instead, we wait until the 'pure' algorithm is done then instead of
00130    * only considering the result for the full menu item text (ie: the
00131    * number in the bottom right corner) we consider all prefixes of the
00132    * menu item text by scanning the entire bottom row (and adding a
00133    * penalty according to the number of characters removed).
00134    */
00135 
00136   /* http://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance */
00137   for (i = 0; i <= needle->length; i++)
00138     d[i][0] = i * hud_settings.add_penalty;
00139 
00140   for (j = 0; j <= haystack->length; j++)
00141     d[0][j] = j * hud_settings.drop_penalty;
00142 
00143   for (i = 1; (n = needle->str[i - 1]); i++)
00144     for (j = 1; (h = haystack->str[j - 1]); j++)
00145       if (n == h)
00146         d[i][j] = d[i - 1][j - 1];
00147 
00148       else
00149         d[i][j] = MIN3(d[i - 1][j - 1] + hud_settings.swap_penalty,
00150                        d[i    ][j - 1] + hud_settings.drop_penalty,
00151                        d[i - 1][j    ] + hud_settings.add_penalty);
00152 
00153   /* Noe we consider all prefixes of the menu item text to discover
00154    * which one gives us the best score.
00155    *
00156    * If we end up picking a result from a column other than the
00157    * rightmost, it will have had the correct multiple of the end-drop
00158    * penalty added to it by the time we're done.
00159    */
00160   result = d[--i][0];
00161   for (j = 1; j <= haystack->length; j++)
00162     result = MIN (d[i][j], result + hud_settings.end_drop_penalty);
00163 
00164   return result;
00165 }
00166 
00167 HudToken *
00168 hud_token_new (const gchar *str,
00169                gssize       length)
00170 {
00171   HudToken *token;
00172   gchar *normal;
00173   gchar *folded;
00174   glong items;
00175 
00176   token = g_slice_new (HudToken);
00177 
00178   token->original = g_strndup (str, length);
00179   normal = g_utf8_normalize (str, length, G_NORMALIZE_ALL);
00180   folded = g_utf8_casefold (normal, -1);
00181   token->str = g_utf8_to_ucs4_fast (folded, -1, &items);
00182   token->length = items;
00183   g_free (folded);
00184   g_free (normal);
00185 
00186   if (!(token->length < TOKEN_LENGTH_LIMIT))
00187     {
00188       token->length = 31;
00189       token->str[31] = 0;
00190     }
00191 
00192   return token;
00193 }
00194 
00195 void
00196 hud_token_free (HudToken *token)
00197 {
00198   g_free (token->original);
00199   g_free (token->str);
00200   g_slice_free (HudToken, token);
00201 }
00202 
00203 static const gchar *
00204 hud_token_get_original (HudToken *token)
00205 {
00206   return token->original;
00207 }
00208 
00209 struct _HudTokenList
00210 {
00211   HudToken **tokens;
00212   gint       length;
00213 };
00214 
00215 static HudTokenList *
00216 hud_token_list_new_consume_array (GPtrArray *array)
00217 {
00218   HudTokenList *list;
00219 
00220   list = g_slice_new (HudTokenList);
00221   list->tokens = (HudToken **) array->pdata;
00222   list->length = array->len;
00223 
00224   g_ptr_array_free (array, FALSE);
00225 
00226   return list;
00227 }
00228 
00229 #define SEPARATORS " .->"
00230 static void
00231 hud_token_list_add_string_to_array (GPtrArray   *array,
00232                                     const gchar *string)
00233 {
00234   while (*string)
00235     {
00236       /* strip separators */
00237       string += strspn (string, SEPARATORS);
00238 
00239       if (*string)
00240         {
00241           gint length;
00242 
00243           /* consume a token */
00244           length = strcspn (string, SEPARATORS);
00245           g_ptr_array_add (array, hud_token_new (string, length));
00246           string += length;
00247         }
00248     }
00249 }
00250 
00251 static void
00252 hud_token_list_add_string_list_to_array (GPtrArray     *array,
00253                                          HudStringList *list)
00254 {
00255   if (list == NULL)
00256     return;
00257 
00258   hud_token_list_add_string_list_to_array (array, hud_string_list_get_tail (list));
00259   hud_token_list_add_string_to_array (array, hud_string_list_get_head (list));
00260 }
00261 
00262 HudTokenList *
00263 hud_token_list_new_from_string (const gchar *string)
00264 {
00265   GPtrArray *array;
00266 
00267   array = g_ptr_array_new ();
00268   hud_token_list_add_string_to_array (array, string);
00269   return hud_token_list_new_consume_array (array);
00270 }
00271 
00272 HudTokenList *
00273 hud_token_list_new_from_string_list (HudStringList *string_list)
00274 {
00275   GPtrArray *array;
00276 
00277   array = g_ptr_array_new ();
00278   hud_token_list_add_string_list_to_array (array, string_list);
00279   return hud_token_list_new_consume_array (array);
00280 }
00281 
00282 void
00283 hud_token_list_free (HudTokenList *list)
00284 {
00285   gint i;
00286 
00287   for (i = 0; i < list->length; i++)
00288     hud_token_free (list->tokens[i]);
00289 
00290   g_free (list->tokens);
00291 
00292   g_slice_free (HudTokenList, list);
00293 }
00294 
00295 guint
00296 hud_token_list_distance (HudTokenList   *haystack,
00297                          HudTokenList   *needle,
00298                          const gchar  ***matches)
00299 {
00300   static guint d[32][32];
00301   gint i, j;
00302 
00303   if (needle->length > haystack->length)
00304     return G_MAXUINT;
00305 
00306   /* Simply refuse to deal with ridiculous situations.
00307    *
00308    * This only happens when the user has more than 32 words in their
00309    * search or the same happens in a menu item.
00310    */
00311   if (haystack->length > 32 || needle->length > 32)
00312     return G_MAXUINT;
00313 
00314   /* unroll the handling of the first needle term */
00315   {
00316     guint cost;
00317 
00318     /* unroll the handling of the first haystack term */
00319     cost = hud_token_distance (haystack->tokens[0], needle->tokens[0]);
00320     d[0][0] = cost;
00321 
00322     for (j = 0; j < haystack->length; j++)
00323       {
00324         guint take_cost;
00325 
00326         take_cost = hud_token_distance (haystack->tokens[j], needle->tokens[0]);
00327         cost = MIN (take_cost, cost + 1);
00328         d[0][j] = cost;
00329       }
00330   }
00331 
00332   /* if we have only one needle, this loop won't run at all */
00333   for (i = 1; i < needle->length; i++)
00334     {
00335       guint cost;
00336 
00337       /* unroll the handling of the first haystack term */
00338       cost = d[i - 1][i - 1] + hud_token_distance (haystack->tokens[i], needle->tokens[i]);
00339       d[i][i] = cost;
00340 
00341       for (j = i + 1; j < haystack->length; j++)
00342         {
00343           guint prev_cost;
00344 
00345           prev_cost = d[i - 1][j - 1];
00346           /* Only do checking of additional terms of it's possible that
00347            * we'll come in under the max distance AND beat the cost of
00348            * just dropping the term.
00349            *
00350            * hud_token_distance() could return zero so we need to try it
00351            * if:
00352            *
00353            *   - prev_cost is less than or equal to the max distance
00354            *     (because then our result could equal the max distance)
00355            *
00356            *   - prev_cost is less than or equal to the last cost
00357            *     Even if it's equal, skipping has a cost and a perfect
00358            *     match has no cost, so we need to try the match.
00359            */
00360           if (prev_cost <= hud_settings.max_distance && prev_cost <= cost)
00361             {
00362               guint take_cost;
00363 
00364               take_cost = prev_cost + hud_token_distance (haystack->tokens[j], needle->tokens[i]);
00365               cost = MIN (take_cost, cost + 1);
00366             }
00367           else
00368             cost = cost + 1;
00369 
00370           d[i][j] = cost;
00371         }
00372     }
00373 
00374   /* Discover which terms were matched */
00375   if (matches)
00376     {
00377       *matches = g_new (const gchar *, needle->length + 1);
00378 
00379       j = haystack->length - 1;
00380 
00381       for (i = needle->length - 1; i >= 0; i--)
00382         {
00383           while (j > i && d[i][j-1] == d[i][j] - 1)
00384             j--;
00385 
00386           (*matches)[i] = hud_token_get_original (haystack->tokens[j]);
00387           j--;
00388         }
00389 
00390       (*matches)[needle->length] = NULL;
00391     }
00392 
00393   return d[needle->length - 1][haystack->length - 1];
00394 }
00395 
00396 guint
00397 hud_token_list_get_length (HudTokenList *list)
00398 {
00399   return list->length;
00400 }