Back to index

citadel  8.12
serv_expand_shorter_urls.c
Go to the documentation of this file.
00001 
00002 /*
00003  *
00004  * Copyright (c) 1998-2012 by the citadel.org team
00005  *
00006  *  This program is open source software; you can redistribute it and/or modify
00007  *  it under the terms of the GNU General Public License version 3.
00008  *  
00009  *  
00010  *
00011  *  This program is distributed in the hope that it will be useful,
00012  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  *  GNU General Public License for more details.
00015  *
00016  *  
00017  *  
00018  *  
00019  */
00020 
00021 #include "sysdep.h"
00022 #include <stdlib.h>
00023 #include <unistd.h>
00024 #include <stdio.h>
00025 #include <termios.h>
00026 #include <fcntl.h>
00027 #include <signal.h>
00028 #include <pwd.h>
00029 #include <errno.h>
00030 #include <sys/types.h>
00031 #include <syslog.h>
00032 
00033 #if TIME_WITH_SYS_TIME
00034 # include <sys/time.h>
00035 # include <time.h>
00036 #else
00037 # if HAVE_SYS_TIME_H
00038 #  include <sys/time.h>
00039 # else
00040 #  include <time.h>
00041 # endif
00042 #endif
00043 #include <sys/wait.h>
00044 #include <ctype.h>
00045 #include <string.h>
00046 #include <limits.h>
00047 #include <sys/socket.h>
00048 #include <netinet/in.h>
00049 #include <arpa/inet.h>
00050 #include <assert.h>
00051 
00052 #include <libcitadel.h>
00053 #include "citadel.h"
00054 #include "server.h"
00055 #include "citserver.h"
00056 #include "support.h"
00057 #include "config.h"
00058 #include "control.h"
00059 #include "user_ops.h"
00060 #include "database.h"
00061 #include "msgbase.h"
00062 #include "internet_addressing.h"
00063 #include "genstamp.h"
00064 #include "domain.h"
00065 #include "ctdl_module.h"
00066 #include "locate_host.h"
00067 #include "citadel_dirs.h"
00068 
00069 #include "event_client.h"
00070 
00071 HashList *UrlShorteners = NULL;
00072 
00073 size_t GetLocationString( void *ptr, size_t size, size_t nmemb, void *userdata)
00074 {
00075 #define LOCATION "location"
00076        if (strncasecmp((char*)ptr, LOCATION, sizeof(LOCATION) - 1) == 0)
00077        {
00078               StrBuf *pURL = (StrBuf*) userdata;
00079               char *pch = (char*) ptr;
00080               char *pche;
00081               
00082               pche = pch + (size * nmemb);
00083               pch += sizeof(LOCATION);
00084               
00085               while (isspace(*pch) || (*pch == ':'))
00086                      pch ++;
00087 
00088               while (isspace(*pche) || (*pche == '\0'))
00089                      pche--;
00090               
00091               FlushStrBuf(pURL);
00092               StrBufPlain(pURL, pch, pche - pch + 1);   
00093        }
00094        return size * nmemb;
00095 }
00096 eNextState ShutdownLookuUrl(AsyncIO *IO)
00097 {
00098 //TOOD
00099        return eAbort;
00100 }
00101 eNextState TerminateLookupUrl(AsyncIO *IO)
00102 {
00103 //TOOD
00104        return eAbort;
00105 }
00106 eNextState TerminateLookupUrlDB(AsyncIO *IO)
00107 {
00108 //TOOD
00109        return eAbort;
00110 }
00111 eNextState LookupUrlResult(AsyncIO *IO)
00112 {
00113        return eTerminateConnection; 
00114 }
00115 
00116 int LookupUrl(StrBuf *ShorterUrlStr)
00117 {
00118        CURLcode sta;
00119        int rc = 0;
00120        CURL *chnd;
00121        AsyncIO *IO;
00122 
00123 
00124        IO = (AsyncIO*) malloc(sizeof(AsyncIO));
00125        memset(IO, 0, sizeof(AsyncIO));
00126        IO->CitContext = CloneContext(CC);
00127 
00128        ParseURL(&IO->ConnectMe, ShorterUrlStr, 80);
00129        CurlPrepareURL(IO->ConnectMe);
00130        if (! InitcURLIOStruct(IO, 
00131 //                     Ctx, 
00132                        NULL,
00133                        "Citadel RSS ShorterURL Expander",
00134                        LookupUrlResult, 
00135                        TerminateLookupUrl, 
00136                        TerminateLookupUrlDB, 
00137                        ShutdownLookuUrl))
00138        {
00139               syslog(LOG_ALERT, "Unable to initialize libcurl.\n");
00140               goto shutdown;
00141        }
00142        chnd = IO->HttpReq.chnd;
00143 
00144        OPT(SSL_VERIFYPEER, 0);
00145        OPT(SSL_VERIFYHOST, 0);
00146        OPT(FOLLOWLOCATION, 10);
00147 #ifdef CURLOPT_HTTP_CONTENT_DECODING
00148        OPT(HTTP_CONTENT_DECODING, 1);
00149        OPT(ENCODING, "");
00150 #endif 
00151        OPT(HEADERFUNCTION , GetLocationString);
00152        OPT(WRITEHEADER, ShorterUrlStr);
00153 
00154 
00155        if (server_shutting_down)
00156               goto shutdown ;
00157 
00158        QueueCurlContext(IO);
00159 
00160 shutdown:
00161 
00162               return rc;
00163 
00164 }
00165 
00166 
00167 
00168 void CrawlMessageForShorterUrls(HashList *pUrls, StrBuf *Message)
00169 {
00170        int nHits = 0;
00171        void *pv;
00172        int nShorter = 0;
00173        const char *pch;
00174        const char *pUrl;
00175        ConstStr *pCUrl;
00176 
00177        while (GetHash(UrlShorteners, IKEY(nShorter), &pv))
00178        {
00179               nShorter++;
00180               pch = ChrPtr(Message);
00181               pUrl = strstr(pch, ChrPtr((StrBuf*)pv));
00182               while ((pUrl != NULL) && (nHits < 99))
00183               {
00184                      pCUrl = malloc(sizeof(ConstStr));
00185 
00186                      pCUrl->Key = pUrl;
00187                      pch = pUrl + StrLength((StrBuf*)pv);
00188                      while (isalnum(*pch)||(*pch == '-')||(*pch == '/'))
00189                             pch++;
00190                      pCUrl->len = pch - pCUrl->Key;
00191 
00192                      Put(pUrls, IKEY(nHits), pCUrl, NULL);
00193                      nHits ++;
00194                      pUrl = strstr(pch, ChrPtr((StrBuf*)pv));
00195               }
00196        }
00197 }
00198 
00199 int SortConstStrByPosition(const void *Item1, const void *Item2)
00200 {
00201        const ConstStr *p1, *p2;
00202        p1 = (const ConstStr*) Item1;
00203        p2 = (const ConstStr*) Item2;
00204        if (p1->Key == p2->Key)
00205               return 0;
00206        if (p1->Key > p2->Key)
00207               return 1;
00208        return -1;
00209 }
00210 
00211 HashList *GetShorterUrls(StrBuf *Message)
00212 {
00213        HashList *pUrls;
00214        /* we just suspect URL shorteners to be inside of feeds from twitter
00215         * or other short content messages, so don't crawl through real blogs.
00216         */
00217        if (StrLength(Message) > 500)
00218               return NULL;
00219 
00220        pUrls = NewHash(1, Flathash);
00221        CrawlMessageForShorterUrls(pUrls, Message);
00222 
00223        if (GetCount(pUrls) > 0)
00224               return pUrls;
00225        else 
00226               return NULL;
00227 
00228 }
00229 
00230 void ExpandShortUrls(StrBuf *Message, HashList *pUrls, int Callback)
00231 {
00232        StrBuf *Shadow;
00233        ConstStr *pCUrl;
00234        const char *pch;
00235        const char *pche;
00236 
00237        StrBuf *ShorterUrlStr;
00238        HashPos *Pos;
00239        const char *Key;
00240        void *pv;
00241        long len;
00242        
00243        Shadow = NewStrBufPlain(NULL, StrLength(Message));
00244        SortByPayload (pUrls, SortConstStrByPosition);
00245               
00246        ShorterUrlStr = NewStrBufPlain(NULL, StrLength(Message));
00247               
00248        pch = ChrPtr(Message);
00249        pche = pch + StrLength(Message);
00250        Pos = GetNewHashPos(pUrls, 1);
00251        while (GetNextHashPos(pUrls, Pos, &len, &Key, &pv))
00252        {
00253               pCUrl = (ConstStr*) pv;
00254 
00255               if (pch != pCUrl->Key)
00256                      StrBufAppendBufPlain(Shadow, pch, pCUrl->Key - pch, 0);
00257                      
00258               StrBufPlain(ShorterUrlStr, CKEY(*pCUrl));
00259               if (LookupUrl(ShorterUrlStr))
00260               {
00261                      StrBufAppendBufPlain(Shadow, HKEY("<a href=\""), 0);
00262                      StrBufAppendBuf(Shadow, ShorterUrlStr, 0);
00263                      StrBufAppendBufPlain(Shadow, HKEY("\">"), 0);
00264                      StrBufAppendBuf(Shadow, ShorterUrlStr, 0);
00265                      StrBufAppendBufPlain(Shadow, HKEY("["), 0);
00266                      StrBufAppendBufPlain(Shadow, pCUrl->Key, pCUrl->len, 0);
00267                      StrBufAppendBufPlain(Shadow, HKEY("]</a>"), 0);
00268               }
00269               else
00270               {
00271                      StrBufAppendBufPlain(Shadow, HKEY("<a href=\""), 0);
00272                      StrBufAppendBufPlain(Shadow, pCUrl->Key, pCUrl->len, 0);
00273                      StrBufAppendBufPlain(Shadow, HKEY("\">"), 0);
00274                      StrBufAppendBufPlain(Shadow, pCUrl->Key, pCUrl->len, 0);
00275                      StrBufAppendBufPlain(Shadow, HKEY("</a>"), 0);
00276               }
00277               pch = pCUrl->Key + pCUrl->len + 1;
00278 
00279        }
00280        if (pch < pche)
00281               StrBufAppendBufPlain(Shadow, pch, pche - pch, 0);
00282        FlushStrBuf(Message);
00283        StrBufAppendBuf(Message, Shadow, 0);
00284 
00285        FreeStrBuf(&ShorterUrlStr);
00286        FreeStrBuf(&Shadow);
00287        DeleteHashPos(&Pos);
00288        
00289 
00290        DeleteHash(&pUrls);
00291 }
00292 
00293 void LoadUrlShorteners(void)
00294 {
00295        int i = 0;
00296        int fd;
00297        const char *POS = NULL;
00298        const char *Err = NULL;
00299        StrBuf *Content, *Line;
00300 
00301 
00302        UrlShorteners = NewHash(0, Flathash);
00303 
00304        fd = open(file_citadel_urlshorteners, 0);
00305 
00306        if (fd != 0)
00307        {
00308               Content = NewStrBufPlain(NULL, SIZ);
00309               Line = NewStrBuf();
00310               while (POS != StrBufNOTNULL)
00311               {
00312                      StrBufTCP_read_buffered_line_fast (Line, Content, &POS, &fd, 1, 1, &Err);
00313                      StrBufTrim(Line);
00314                      if ((*ChrPtr(Line) != '#') && (StrLength(Line) > 0))
00315                      {
00316                             Put(UrlShorteners, IKEY(i), Line, HFreeStrBuf);
00317                             i++;
00318                             Line = NewStrBuf();
00319                      }
00320                      else
00321                             FlushStrBuf(Line);
00322                      if (POS == NULL)
00323                             POS = StrBufNOTNULL;
00324               }
00325               FreeStrBuf(&Line);
00326               FreeStrBuf(&Content);
00327        }
00328        close(fd);
00329 }
00330 
00331 void shorter_url_cleanup(void)
00332 {
00333        DeleteHash(&UrlShorteners);
00334 }
00335 
00336 
00337 CTDL_MODULE_INIT(urldeshortener)
00338 {
00339        if (threading)
00340        {
00341               syslog(LOG_INFO, "%s\n", curl_version());
00342        }
00343        else 
00344        {
00345               LoadUrlShorteners ();
00346                 CtdlRegisterCleanupHook(shorter_url_cleanup);
00347        }
00348        return "UrlShortener";
00349 }