index/trav.c

   1 /*
   2  * Copyright (C) 1994-1999, Index Data
   3  * All rights reserved.
   4  * Sebastian Hammer, Adam Dickmeiss
   5  *
   6  * $Log: trav.c,v $
   7  * Revision 1.37  2002-02-20 17:30:01  adam
   8  * Work on new API. Locking system re-implemented
   9  *
  10  * Revision 1.36  1999/05/15 14:36:38  adam
  11  * Updated dictionary. Implemented "compression" of dictionary.
  12  *
  13  * Revision 1.35  1999/02/02 14:51:09  adam
  14  * Updated WIN32 code specific sections. Changed header.
  15  *
  16  * Revision 1.34  1998/06/08 14:43:14  adam
   17  * Added support for EXPLAIN Proxy servers - added settings databasePath
  18  * and explainDatabase to facilitate this. Increased maximum number
  19  * of databases and attributes in one register.
  20  *
  21  * Revision 1.33  1998/01/12 15:04:08  adam
  22  * The test option (-s) only uses read-lock (and not write lock).
  23  *
  24  * Revision 1.32  1997/09/25 14:56:51  adam
  25  * Windows NT interface code to the stat call.
  26  *
  27  * Revision 1.31  1997/09/17 12:19:17  adam
  28  * Zebra version corresponds to YAZ version 1.4.
  29  * Changed Zebra server so that it doesn't depend on global common_resource.
  30  *
  31  * Revision 1.30  1997/09/09 13:38:09  adam
  32  * Partial port to WIN95/NT.
  33  *
  34  * Revision 1.29  1997/02/12 20:39:47  adam
  35  * Implemented options -f <n> that limits the log to the first <n>
  36  * records.
  37  * Changed some log messages also.
  38  *
  39  * Revision 1.28  1996/11/01 08:58:44  adam
  40  * Interface to isamc system now includes update and delete.
  41  *
  42  * Revision 1.27  1996/10/29 14:06:56  adam
  43  * Include zebrautl.h instead of alexutil.h.
  44  *
  45  * Revision 1.26  1996/06/04 10:19:01  adam
  46  * Minor changes - removed include of ctype.h.
  47  *
  48  * Revision 1.25  1996/05/01  13:46:37  adam
  49  * First work on multiple records in one file.
  50  * New option, -offset, to the "unread" command in the filter module.
  51  *
  52  * Revision 1.24  1996/04/26  10:00:23  adam
  53  * Added option -V to zebraidx to display version information.
  54  * Removed stupid warnings from file update.
  55  *
  56  * Revision 1.23  1996/04/12  07:02:25  adam
  57  * File update of single files.
  58  *
  59  * Revision 1.22  1996/04/09 06:50:50  adam
  60  * Bug fix: bad reference in function fileUpdateR.
  61  *
  62  * Revision 1.21  1996/03/22 15:34:18  quinn
  63  * Fixed bad reference
  64  *
  65  * Revision 1.20  1996/03/21  14:50:10  adam
  66  * File update uses modify-time instead of change-time.
  67  *
  68  * Revision 1.19  1996/03/20  16:16:55  quinn
  69  * Added diagnostic output
  70  *
  71  * Revision 1.18  1996/03/19  12:43:27  adam
  72  * Bug fix: File update traversal didn't handle trailing slashes correctly.
  73  * Bug fix: Update of sub directory groups wasn't handled correctly.
  74  *
  75  * Revision 1.17  1996/02/12  18:45:17  adam
  76  * Changed naming of some functions.
  77  *
  78  * Revision 1.16  1996/02/05  12:30:02  adam
  79  * Logging reduced a bit.
  80  * The remaining running time is estimated during register merge.
  81  *
  82  * Revision 1.15  1995/12/07  17:38:48  adam
  83  * Work locking mechanisms for concurrent updates/commit.
  84  *
  85  * Revision 1.14  1995/12/06  12:41:26  adam
  86  * New command 'stat' for the index program.
  87  * Filenames can be read from stdin by specifying '-'.
  88  * Bug fix/enhancement of the transformation from terms to regular
   89  * expressions in the search engine.
  90  *
  91  * Revision 1.13  1995/11/28  09:09:46  adam
  92  * Zebra config renamed.
  93  * Use setting 'recordId' to identify record now.
   94  * Bug fix in recindex.c: rec_release_blocks was invoked even
  95  * though the blocks were already released.
  96  * File traversal properly deletes records when needed.
  97  *
  98  * Revision 1.12  1995/11/24  11:31:37  adam
  99  * Commands add & del read filenames from stdin if source directory is
 100  * empty.
 101  * Match criteria supports 'constant' strings.
 102  *
 103  * Revision 1.11  1995/11/22  17:19:19  adam
 104  * Record management uses the bfile system.
 105  *
 106  * Revision 1.10  1995/11/21  15:01:16  adam
 107  * New general match criteria implemented.
 108  * New feature: document groups.
 109  *
 110  * Revision 1.9  1995/11/21  09:20:32  adam
 111  * Yet more work on record match.
 112  *
 113  * Revision 1.8  1995/11/20  16:59:46  adam
 114  * New update method: the 'old' keys are saved for each records.
 115  *
 116  * Revision 1.7  1995/11/20  11:56:28  adam
 117  * Work on new traversal.
 118  *
 119  * Revision 1.6  1995/11/17  15:54:42  adam
 120  * Started work on virtual directory structure.
 121  *
 122  * Revision 1.5  1995/10/17  18:02:09  adam
 123  * New feature: databases. Implemented as prefix to words in dictionary.
 124  *
 125  * Revision 1.4  1995/09/28  09:19:46  adam
 126  * xfree/xmalloc used everywhere.
 127  * Extract/retrieve method seems to work for text records.
 128  *
 129  * Revision 1.3  1995/09/06  16:11:18  adam
 130  * Option: only one word key per file.
 131  *
 132  * Revision 1.2  1995/09/04  12:33:43  adam
 133  * Various cleanup. YAZ util used instead.
 134  *
 135  * Revision 1.1  1995/09/01  14:06:36  adam
 136  * Split of work into more files.
 137  *
 138  */
 139
 140
 141 #include <stdio.h>
 142 #include <assert.h>
 143 #include <sys/types.h>
 144 #include <sys/stat.h>
 145 #ifdef WIN32
 146 #include <io.h>
 147 #define S_ISREG(x) (x & _S_IFREG)
 148 #define S_ISDIR(x) (x & _S_IFDIR)
 149 #else
 150 #include <unistd.h>
 151 #endif
 152 #include <direntz.h>
 153 #include <fcntl.h>
 154 #include <time.h>
 155
 156 #include "index.h"
 157 #include "zserver.h"
 158
 159 static int repComp (const char *a, const char *b, size_t len)
 160 {
 161     if (!len)
 162         return 0;
 163     return memcmp (a, b, len);
 164 }
 165
 166 static void repositoryExtractR (ZebraHandle zh, int deleteFlag, char *rep,
 167                                 struct recordGroup *rGroup,
 168                                 int level)
 169 {
 170     struct dir_entry *e;
 171     int i;
 172     size_t rep_len = strlen (rep);
 173
 174     e = dir_open (rep);
 175     if (!e)
 176         return;
 177     logf (LOG_LOG, "dir %s", rep);
 178     if (rep[rep_len-1] != '/')
 179         rep[rep_len] = '/';
 180     else
 181         --rep_len;
 182
 183     for (i=0; e[i].name; i++)
 184     {
 185         char *ecp;
 186         strcpy (rep +rep_len+1, e[i].name);
 187         if ((ecp = strrchr (e[i].name, '/')))
 188             *ecp = '\0';
 189         if (level == 0 && rGroup->databaseNamePath)
 190             rGroup->databaseName = e[i].name;
 191
 192         switch (e[i].kind)
 193         {
 194         case dirs_file:
 195             fileExtract (zh, NULL, rep, rGroup, deleteFlag);
 196             break;
 197         case dirs_dir:
 198             repositoryExtractR (zh, deleteFlag, rep, rGroup, level+1);
 199             break;
 200         }
 201     }
 202     dir_free (&e);
 203
 204 }
 205
 206 static void fileDeleteR (ZebraHandle zh,
 207                          struct dirs_info *di, struct dirs_entry *dst,
 208                          const char *base, char *src,
 209                          struct recordGroup *rGroup)
 210 {
 211     char tmppath[1024];
 212     size_t src_len = strlen (src);
 213
 214     while (dst && !repComp (dst->path, src, src_len+1))
 215     {
 216         switch (dst->kind)
 217         {
 218         case dirs_file:
 219             sprintf (tmppath, "%s%s", base, dst->path);
 220             fileExtract (zh, &dst->sysno, tmppath, rGroup, 1);
 221
 222             strcpy (tmppath, dst->path);
 223             dst = dirs_read (di);
 224             dirs_del (di, tmppath);
 225             break;
 226         case dirs_dir:
 227             strcpy (tmppath, dst->path);
 228             dst = dirs_read (di);
 229             dirs_rmdir (di, tmppath);
 230             break;
 231         default:
 232             dst = dirs_read (di);
 233         }
 234     }
 235 }
 236
 237 static void fileUpdateR (ZebraHandle zh,
 238                          struct dirs_info *di, struct dirs_entry *dst,
 239                          const char *base, char *src,
 240                          struct recordGroup *rGroup,
 241                          int level)
 242 {
 243     struct dir_entry *e_src;
 244     int i_src = 0;
 245     static char tmppath[1024];
 246     size_t src_len = strlen (src);
 247
 248     sprintf (tmppath, "%s%s", base, src);
 249     e_src = dir_open (tmppath);
 250     logf (LOG_LOG, "dir %s", tmppath);
 251
 252 #if 0
 253     if (!dst || repComp (dst->path, src, src_len))
 254 #else
 255     if (!dst || strcmp (dst->path, src))
 256 #endif
 257     {
 258         if (!e_src)
 259             return;
 260
 261         if (src_len && src[src_len-1] != '/')
 262         {
 263             src[src_len] = '/';
 264             src[++src_len] = '\0';
 265         }
 266         dirs_mkdir (di, src, 0);
 267         if (dst && repComp (dst->path, src, src_len))
 268             dst = NULL;
 269     }
 270     else if (!e_src)
 271     {
 272         strcpy (src, dst->path);
 273         fileDeleteR (zh, di, dst, base, src, rGroup);
 274         return;
 275     }
 276     else
 277     {
 278         if (src_len && src[src_len-1] != '/')
 279         {
 280             src[src_len] = '/';
 281             src[++src_len] = '\0';
 282         }
 283         dst = dirs_read (di);
 284     }
 285     dir_sort (e_src);
 286
 287     while (1)
 288     {
 289         int sd;
 290
 291         if (dst && !repComp (dst->path, src, src_len))
 292         {
 293             if (e_src[i_src].name)
 294             {
 295                 logf (LOG_DEBUG, "dst=%s src=%s", dst->path + src_len,
 296                       e_src[i_src].name);
 297                 sd = strcmp (dst->path + src_len, e_src[i_src].name);
 298             }
 299             else
 300                 sd = -1;
 301         }
 302         else if (e_src[i_src].name)
 303             sd = 1;
 304         else
 305             break;
 306         logf (LOG_DEBUG, "trav sd=%d", sd);
 307
 308         if (level == 0 && rGroup->databaseNamePath)
 309             rGroup->databaseName = e_src[i_src].name;
 310         if (sd == 0)
 311         {
 312             strcpy (src + src_len, e_src[i_src].name);
 313             sprintf (tmppath, "%s%s", base, src);
 314
 315             switch (e_src[i_src].kind)
 316             {
 317             case dirs_file:
 318                 if (e_src[i_src].mtime > dst->mtime)
 319                 {
 320                     if (fileExtract (zh, &dst->sysno, tmppath, rGroup, 0))
 321                     {
 322                         dirs_add (di, src, dst->sysno, e_src[i_src].mtime);
 323                     }
 324                     logf (LOG_DEBUG, "old: %s", ctime (&dst->mtime));
 325                     logf (LOG_DEBUG, "new: %s", ctime (&e_src[i_src].mtime));
 326                 }
 327                 dst = dirs_read (di);
 328                 break;
 329             case dirs_dir:
 330                 fileUpdateR (zh, di, dst, base, src, rGroup, level+1);
 331                 dst = dirs_last (di);
 332                 logf (LOG_DEBUG, "last is %s", dst ? dst->path : "null");
 333                 break;
 334             default:
 335                 dst = dirs_read (di);
 336             }
 337             i_src++;
 338         }
 339         else if (sd > 0)
 340         {
 341             SYSNO sysno = 0;
 342             strcpy (src + src_len, e_src[i_src].name);
 343             sprintf (tmppath, "%s%s", base, src);
 344
 345             switch (e_src[i_src].kind)
 346             {
 347             case dirs_file:
 348                 if (fileExtract (zh, &sysno, tmppath, rGroup, 0))
 349                     dirs_add (di, src, sysno, e_src[i_src].mtime);
 350                 break;
 351             case dirs_dir:
 352                 fileUpdateR (zh, di, dst, base, src, rGroup, level+1);
 353                 if (dst)
 354                     dst = dirs_last (di);
 355                 break;
 356             }
 357             i_src++;
 358         }
 359         else  /* sd < 0 */
 360         {
 361             strcpy (src, dst->path);
 362             sprintf (tmppath, "%s%s", base, dst->path);
 363
 364             switch (dst->kind)
 365             {
 366             case dirs_file:
 367                 fileExtract (zh, &dst->sysno, tmppath, rGroup, 1);
 368                 dirs_del (di, dst->path);
 369                 dst = dirs_read (di);
 370                 break;
 371             case dirs_dir:
 372                 fileDeleteR (zh, di, dst, base, src, rGroup);
 373                 dst = dirs_last (di);
 374             }
 375         }
 376     }
 377     dir_free (&e_src);
 378 }
 379
 380 static void groupRes (ZebraService zs, struct recordGroup *rGroup)
 381 {
 382     char resStr[256];
 383     char gPrefix[256];
 384
 385     if (!rGroup->groupName || !*rGroup->groupName)
 386         *gPrefix = '\0';
 387     else
 388         sprintf (gPrefix, "%s.", rGroup->groupName);
 389
 390     sprintf (resStr, "%srecordId", gPrefix);
 391     rGroup->recordId = res_get (zs->res, resStr);
 392     sprintf (resStr, "%sdatabasePath", gPrefix);
 393     rGroup->databaseNamePath =
 394         atoi (res_get_def (zs->res, resStr, "0"));
 395 }
 396
 397 void repositoryShow (ZebraHandle zh)
 398
 399 {
 400     struct recordGroup *rGroup = &zh->rGroup;
 401     char src[1024];
 402     int src_len;
 403     struct dirs_entry *dst;
 404     Dict dict;
 405     struct dirs_info *di;
 406
 407     if (!(dict = dict_open (zh->service->bfs, FMATCH_DICT, 50, 0, 0)))
 408     {
 409         logf (LOG_FATAL, "dict_open fail of %s", FMATCH_DICT);
 410         return;
 411     }
 412
 413     assert (rGroup->path);
 414     strcpy (src, rGroup->path);
 415     src_len = strlen (src);
 416
 417     if (src_len && src[src_len-1] != '/')
 418     {
 419         src[src_len] = '/';
 420         src[++src_len] = '\0';
 421     }
 422
 423     di = dirs_open (dict, src, rGroup->flagRw);
 424
 425     while ( (dst = dirs_read (di)) )
 426         logf (LOG_LOG, "%s", dst->path);
 427     dirs_free (&di);
 428     dict_close (dict);
 429 }
 430
 431 static void fileUpdate (ZebraHandle zh,
 432                         Dict dict, struct recordGroup *rGroup,
 433                         const char *path)
 434 {
 435     struct dirs_info *di;
 436     struct stat sbuf;
 437     char src[1024];
 438     char dst[1024];
 439     int src_len;
 440
 441     assert (path);
 442     strcpy (src, path);
 443     src_len = strlen (src);
 444
 445     stat (src, &sbuf);
 446     if (S_ISREG(sbuf.st_mode))
 447     {
 448         struct dirs_entry *e_dst;
 449         di = dirs_fopen (dict, src);
 450
 451         e_dst = dirs_read (di);
 452         if (e_dst)
 453         {
 454             if (sbuf.st_mtime > e_dst->mtime)
 455                 if (fileExtract (zh, &e_dst->sysno, src, rGroup, 0))
 456                     dirs_add (di, src, e_dst->sysno, sbuf.st_mtime);
 457         }
 458         else
 459         {
 460             SYSNO sysno = 0;
 461             if (fileExtract (zh, &sysno, src, rGroup, 0))
 462                  dirs_add (di, src, sysno, sbuf.st_mtime);
 463         }
 464         dirs_free (&di);
 465     }
 466     else if (S_ISDIR(sbuf.st_mode))
 467     {
 468         if (src_len && src[src_len-1] != '/')
 469         {
 470             src[src_len] = '/';
 471             src[++src_len] = '\0';
 472         }
 473         di = dirs_open (dict, src, rGroup->flagRw);
 474         *dst = '\0';
 475         fileUpdateR (zh, di, dirs_read (di), src, dst, rGroup, 0);
 476         dirs_free (&di);
 477     }
 478     else
 479     {
 480         logf (LOG_WARN, "Ignoring path %s", src);
 481     }
 482 }
 483
 484
 485 static void repositoryExtract (ZebraHandle zh,
 486                                int deleteFlag, struct recordGroup *rGroup,
 487                                const char *path)
 488 {
 489     struct stat sbuf;
 490     char src[1024];
 491
 492     assert (path);
 493     strcpy (src, path);
 494
 495     stat (src, &sbuf);
 496     if (S_ISREG(sbuf.st_mode))
 497         fileExtract (zh, NULL, src, rGroup, deleteFlag);
 498     else if (S_ISDIR(sbuf.st_mode))
 499         repositoryExtractR (zh, deleteFlag, src, rGroup, 0);
 500     else
 501         logf (LOG_WARN, "Ignoring path %s", src);
 502 }
 503
 504 static void repositoryExtractG (ZebraHandle zh,
 505                                 int deleteFlag, struct recordGroup *rGroup)
 506 {
 507     if (*rGroup->path == '\0' || !strcmp(rGroup->path, "-"))
 508     {
 509         char src[1024];
 510
 511         while (scanf ("%s", src) == 1)
 512             repositoryExtract (zh, deleteFlag, rGroup, src);
 513     }
 514     else
 515         repositoryExtract (zh, deleteFlag, rGroup, rGroup->path);
 516 }
 517
 518 void repositoryUpdate (ZebraHandle zh)
 519 {
 520     struct recordGroup *rGroup = &zh->rGroup;
 521     groupRes (zh->service, rGroup);
 522     assert (rGroup->path);
 523     if (rGroup->recordId && !strcmp (rGroup->recordId, "file"))
 524     {
 525         Dict dict;
 526         if (!(dict = dict_open (zh->service->bfs, FMATCH_DICT, 50,
 527                                 rGroup->flagRw, 0)))
 528         {
 529             logf (LOG_FATAL, "dict_open fail of %s", FMATCH_DICT);
 530             return ;
 531         }
 532         if (*rGroup->path == '\0' || !strcmp(rGroup->path, "-"))
 533         {
 534             char src[1024];
 535             while (scanf ("%s", src) == 1)
 536                 fileUpdate (zh, dict, rGroup, src);
 537         }
 538         else
 539             fileUpdate (zh, dict, rGroup, rGroup->path);
 540         dict_close (dict);
 541     }
 542     else
 543         repositoryExtractG (zh, 0, rGroup);
 544 }
 545
 546 void repositoryDelete (ZebraHandle zh)
 547 {
 548     repositoryExtractG (zh, 1, &zh->rGroup);
 549 }
 550