index/trav.c

   1 /*
   2  * Copyright (C) 1994-1999, Index Data
   3  * All rights reserved.
   4  * Sebastian Hammer, Adam Dickmeiss
   5  *
   6  * $Log: trav.c,v $
   7  * Revision 1.38  2002-04-04 14:14:13  adam
   8  * Multiple registers (alpha early)
   9  *
  10  * Revision 1.37  2002/02/20 17:30:01  adam
  11  * Work on new API. Locking system re-implemented
  12  *
  13  * Revision 1.36  1999/05/15 14:36:38  adam
  14  * Updated dictionary. Implemented "compression" of dictionary.
  15  *
  16  * Revision 1.35  1999/02/02 14:51:09  adam
  17  * Updated WIN32 code specific sections. Changed header.
  18  *
  19  * Revision 1.34  1998/06/08 14:43:14  adam
   20  * Added support for EXPLAIN Proxy servers - added settings databasePath
  21  * and explainDatabase to facilitate this. Increased maximum number
  22  * of databases and attributes in one register.
  23  *
  24  * Revision 1.33  1998/01/12 15:04:08  adam
  25  * The test option (-s) only uses read-lock (and not write lock).
  26  *
  27  * Revision 1.32  1997/09/25 14:56:51  adam
  28  * Windows NT interface code to the stat call.
  29  *
  30  * Revision 1.31  1997/09/17 12:19:17  adam
  31  * Zebra version corresponds to YAZ version 1.4.
  32  * Changed Zebra server so that it doesn't depend on global common_resource.
  33  *
  34  * Revision 1.30  1997/09/09 13:38:09  adam
  35  * Partial port to WIN95/NT.
  36  *
  37  * Revision 1.29  1997/02/12 20:39:47  adam
  38  * Implemented options -f <n> that limits the log to the first <n>
  39  * records.
  40  * Changed some log messages also.
  41  *
  42  * Revision 1.28  1996/11/01 08:58:44  adam
  43  * Interface to isamc system now includes update and delete.
  44  *
  45  * Revision 1.27  1996/10/29 14:06:56  adam
  46  * Include zebrautl.h instead of alexutil.h.
  47  *
  48  * Revision 1.26  1996/06/04 10:19:01  adam
  49  * Minor changes - removed include of ctype.h.
  50  *
  51  * Revision 1.25  1996/05/01  13:46:37  adam
  52  * First work on multiple records in one file.
  53  * New option, -offset, to the "unread" command in the filter module.
  54  *
  55  * Revision 1.24  1996/04/26  10:00:23  adam
  56  * Added option -V to zebraidx to display version information.
  57  * Removed stupid warnings from file update.
  58  *
  59  * Revision 1.23  1996/04/12  07:02:25  adam
  60  * File update of single files.
  61  *
  62  * Revision 1.22  1996/04/09 06:50:50  adam
  63  * Bug fix: bad reference in function fileUpdateR.
  64  *
  65  * Revision 1.21  1996/03/22 15:34:18  quinn
  66  * Fixed bad reference
  67  *
  68  * Revision 1.20  1996/03/21  14:50:10  adam
  69  * File update uses modify-time instead of change-time.
  70  *
  71  * Revision 1.19  1996/03/20  16:16:55  quinn
  72  * Added diagnostic output
  73  *
  74  * Revision 1.18  1996/03/19  12:43:27  adam
  75  * Bug fix: File update traversal didn't handle trailing slashes correctly.
  76  * Bug fix: Update of sub directory groups wasn't handled correctly.
  77  *
  78  * Revision 1.17  1996/02/12  18:45:17  adam
  79  * Changed naming of some functions.
  80  *
  81  * Revision 1.16  1996/02/05  12:30:02  adam
  82  * Logging reduced a bit.
  83  * The remaining running time is estimated during register merge.
  84  *
  85  * Revision 1.15  1995/12/07  17:38:48  adam
  86  * Work locking mechanisms for concurrent updates/commit.
  87  *
  88  * Revision 1.14  1995/12/06  12:41:26  adam
  89  * New command 'stat' for the index program.
  90  * Filenames can be read from stdin by specifying '-'.
  91  * Bug fix/enhancement of the transformation from terms to regular
   92  * expressions in the search engine.
  93  *
  94  * Revision 1.13  1995/11/28  09:09:46  adam
  95  * Zebra config renamed.
  96  * Use setting 'recordId' to identify record now.
   97  * Bug fix in recindex.c: rec_release_blocks was invoked even
  98  * though the blocks were already released.
  99  * File traversal properly deletes records when needed.
 100  *
 101  * Revision 1.12  1995/11/24  11:31:37  adam
 102  * Commands add & del read filenames from stdin if source directory is
 103  * empty.
 104  * Match criteria supports 'constant' strings.
 105  *
 106  * Revision 1.11  1995/11/22  17:19:19  adam
 107  * Record management uses the bfile system.
 108  *
 109  * Revision 1.10  1995/11/21  15:01:16  adam
 110  * New general match criteria implemented.
 111  * New feature: document groups.
 112  *
 113  * Revision 1.9  1995/11/21  09:20:32  adam
 114  * Yet more work on record match.
 115  *
 116  * Revision 1.8  1995/11/20  16:59:46  adam
 117  * New update method: the 'old' keys are saved for each records.
 118  *
 119  * Revision 1.7  1995/11/20  11:56:28  adam
 120  * Work on new traversal.
 121  *
 122  * Revision 1.6  1995/11/17  15:54:42  adam
 123  * Started work on virtual directory structure.
 124  *
 125  * Revision 1.5  1995/10/17  18:02:09  adam
 126  * New feature: databases. Implemented as prefix to words in dictionary.
 127  *
 128  * Revision 1.4  1995/09/28  09:19:46  adam
 129  * xfree/xmalloc used everywhere.
 130  * Extract/retrieve method seems to work for text records.
 131  *
 132  * Revision 1.3  1995/09/06  16:11:18  adam
 133  * Option: only one word key per file.
 134  *
 135  * Revision 1.2  1995/09/04  12:33:43  adam
 136  * Various cleanup. YAZ util used instead.
 137  *
 138  * Revision 1.1  1995/09/01  14:06:36  adam
 139  * Split of work into more files.
 140  *
 141  */
 142
 143
 144 #include <stdio.h>
 145 #include <assert.h>
 146 #include <sys/types.h>
 147 #include <sys/stat.h>
 148 #ifdef WIN32
 149 #include <io.h>
 150 #define S_ISREG(x) (x & _S_IFREG)
 151 #define S_ISDIR(x) (x & _S_IFDIR)
 152 #else
 153 #include <unistd.h>
 154 #endif
 155 #include <direntz.h>
 156 #include <fcntl.h>
 157 #include <time.h>
 158
 159 #include "index.h"
 160
 161 static int repComp (const char *a, const char *b, size_t len)
 162 {
 163     if (!len)
 164         return 0;
 165     return memcmp (a, b, len);
 166 }
 167
 168 static void repositoryExtractR (ZebraHandle zh, int deleteFlag, char *rep,
 169                                 struct recordGroup *rGroup,
 170                                 int level)
 171 {
 172     struct dir_entry *e;
 173     int i;
 174     size_t rep_len = strlen (rep);
 175
 176     e = dir_open (rep);
 177     if (!e)
 178         return;
 179     logf (LOG_LOG, "dir %s", rep);
 180     if (rep[rep_len-1] != '/')
 181         rep[rep_len] = '/';
 182     else
 183         --rep_len;
 184
 185     for (i=0; e[i].name; i++)
 186     {
 187         char *ecp;
 188         strcpy (rep +rep_len+1, e[i].name);
 189         if ((ecp = strrchr (e[i].name, '/')))
 190             *ecp = '\0';
 191         if (level == 0 && rGroup->databaseNamePath)
 192             rGroup->databaseName = e[i].name;
 193
 194         switch (e[i].kind)
 195         {
 196         case dirs_file:
 197             fileExtract (zh, NULL, rep, rGroup, deleteFlag);
 198             break;
 199         case dirs_dir:
 200             repositoryExtractR (zh, deleteFlag, rep, rGroup, level+1);
 201             break;
 202         }
 203     }
 204     dir_free (&e);
 205
 206 }
 207
 208 static void fileDeleteR (ZebraHandle zh,
 209                          struct dirs_info *di, struct dirs_entry *dst,
 210                          const char *base, char *src,
 211                          struct recordGroup *rGroup)
 212 {
 213     char tmppath[1024];
 214     size_t src_len = strlen (src);
 215
 216     while (dst && !repComp (dst->path, src, src_len+1))
 217     {
 218         switch (dst->kind)
 219         {
 220         case dirs_file:
 221             sprintf (tmppath, "%s%s", base, dst->path);
 222             fileExtract (zh, &dst->sysno, tmppath, rGroup, 1);
 223
 224             strcpy (tmppath, dst->path);
 225             dst = dirs_read (di);
 226             dirs_del (di, tmppath);
 227             break;
 228         case dirs_dir:
 229             strcpy (tmppath, dst->path);
 230             dst = dirs_read (di);
 231             dirs_rmdir (di, tmppath);
 232             break;
 233         default:
 234             dst = dirs_read (di);
 235         }
 236     }
 237 }
 238
 239 static void fileUpdateR (ZebraHandle zh,
 240                          struct dirs_info *di, struct dirs_entry *dst,
 241                          const char *base, char *src,
 242                          struct recordGroup *rGroup,
 243                          int level)
 244 {
 245     struct dir_entry *e_src;
 246     int i_src = 0;
 247     static char tmppath[1024];
 248     size_t src_len = strlen (src);
 249
 250     sprintf (tmppath, "%s%s", base, src);
 251     e_src = dir_open (tmppath);
 252     logf (LOG_LOG, "dir %s", tmppath);
 253
 254 #if 0
 255     if (!dst || repComp (dst->path, src, src_len))
 256 #else
 257     if (!dst || strcmp (dst->path, src))
 258 #endif
 259     {
 260         if (!e_src)
 261             return;
 262
 263         if (src_len && src[src_len-1] != '/')
 264         {
 265             src[src_len] = '/';
 266             src[++src_len] = '\0';
 267         }
 268         dirs_mkdir (di, src, 0);
 269         if (dst && repComp (dst->path, src, src_len))
 270             dst = NULL;
 271     }
 272     else if (!e_src)
 273     {
 274         strcpy (src, dst->path);
 275         fileDeleteR (zh, di, dst, base, src, rGroup);
 276         return;
 277     }
 278     else
 279     {
 280         if (src_len && src[src_len-1] != '/')
 281         {
 282             src[src_len] = '/';
 283             src[++src_len] = '\0';
 284         }
 285         dst = dirs_read (di);
 286     }
 287     dir_sort (e_src);
 288
 289     while (1)
 290     {
 291         int sd;
 292
 293         if (dst && !repComp (dst->path, src, src_len))
 294         {
 295             if (e_src[i_src].name)
 296             {
 297                 logf (LOG_DEBUG, "dst=%s src=%s", dst->path + src_len,
 298                       e_src[i_src].name);
 299                 sd = strcmp (dst->path + src_len, e_src[i_src].name);
 300             }
 301             else
 302                 sd = -1;
 303         }
 304         else if (e_src[i_src].name)
 305             sd = 1;
 306         else
 307             break;
 308         logf (LOG_DEBUG, "trav sd=%d", sd);
 309
 310         if (level == 0 && rGroup->databaseNamePath)
 311             rGroup->databaseName = e_src[i_src].name;
 312         if (sd == 0)
 313         {
 314             strcpy (src + src_len, e_src[i_src].name);
 315             sprintf (tmppath, "%s%s", base, src);
 316
 317             switch (e_src[i_src].kind)
 318             {
 319             case dirs_file:
 320                 if (e_src[i_src].mtime > dst->mtime)
 321                 {
 322                     if (fileExtract (zh, &dst->sysno, tmppath, rGroup, 0))
 323                     {
 324                         dirs_add (di, src, dst->sysno, e_src[i_src].mtime);
 325                     }
 326                     logf (LOG_DEBUG, "old: %s", ctime (&dst->mtime));
 327                     logf (LOG_DEBUG, "new: %s", ctime (&e_src[i_src].mtime));
 328                 }
 329                 dst = dirs_read (di);
 330                 break;
 331             case dirs_dir:
 332                 fileUpdateR (zh, di, dst, base, src, rGroup, level+1);
 333                 dst = dirs_last (di);
 334                 logf (LOG_DEBUG, "last is %s", dst ? dst->path : "null");
 335                 break;
 336             default:
 337                 dst = dirs_read (di);
 338             }
 339             i_src++;
 340         }
 341         else if (sd > 0)
 342         {
 343             SYSNO sysno = 0;
 344             strcpy (src + src_len, e_src[i_src].name);
 345             sprintf (tmppath, "%s%s", base, src);
 346
 347             switch (e_src[i_src].kind)
 348             {
 349             case dirs_file:
 350                 if (fileExtract (zh, &sysno, tmppath, rGroup, 0))
 351                     dirs_add (di, src, sysno, e_src[i_src].mtime);
 352                 break;
 353             case dirs_dir:
 354                 fileUpdateR (zh, di, dst, base, src, rGroup, level+1);
 355                 if (dst)
 356                     dst = dirs_last (di);
 357                 break;
 358             }
 359             i_src++;
 360         }
 361         else  /* sd < 0 */
 362         {
 363             strcpy (src, dst->path);
 364             sprintf (tmppath, "%s%s", base, dst->path);
 365
 366             switch (dst->kind)
 367             {
 368             case dirs_file:
 369                 fileExtract (zh, &dst->sysno, tmppath, rGroup, 1);
 370                 dirs_del (di, dst->path);
 371                 dst = dirs_read (di);
 372                 break;
 373             case dirs_dir:
 374                 fileDeleteR (zh, di, dst, base, src, rGroup);
 375                 dst = dirs_last (di);
 376             }
 377         }
 378     }
 379     dir_free (&e_src);
 380 }
 381
 382 static void groupRes (ZebraHandle zh, struct recordGroup *rGroup)
 383 {
 384     char resStr[256];
 385     char gPrefix[256];
 386
 387     if (!rGroup->groupName || !*rGroup->groupName)
 388         *gPrefix = '\0';
 389     else
 390         sprintf (gPrefix, "%s.", rGroup->groupName);
 391
 392     sprintf (resStr, "%srecordId", gPrefix);
 393     rGroup->recordId = res_get (zh->res, resStr);
 394     sprintf (resStr, "%sdatabasePath", gPrefix);
 395     rGroup->databaseNamePath =
 396         atoi (res_get_def (zh->res, resStr, "0"));
 397 }
 398
 399 void repositoryShow (ZebraHandle zh)
 400
 401 {
 402     struct recordGroup *rGroup = &zh->rGroup;
 403     char src[1024];
 404     int src_len;
 405     struct dirs_entry *dst;
 406     Dict dict;
 407     struct dirs_info *di;
 408
 409     if (!(dict = dict_open (zh->reg->bfs, FMATCH_DICT, 50, 0, 0)))
 410     {
 411         logf (LOG_FATAL, "dict_open fail of %s", FMATCH_DICT);
 412         return;
 413     }
 414
 415     assert (rGroup->path);
 416     strcpy (src, rGroup->path);
 417     src_len = strlen (src);
 418
 419     if (src_len && src[src_len-1] != '/')
 420     {
 421         src[src_len] = '/';
 422         src[++src_len] = '\0';
 423     }
 424
 425     di = dirs_open (dict, src, rGroup->flagRw);
 426
 427     while ( (dst = dirs_read (di)) )
 428         logf (LOG_LOG, "%s", dst->path);
 429     dirs_free (&di);
 430     dict_close (dict);
 431 }
 432
 433 static void fileUpdate (ZebraHandle zh,
 434                         Dict dict, struct recordGroup *rGroup,
 435                         const char *path)
 436 {
 437     struct dirs_info *di;
 438     struct stat sbuf;
 439     char src[1024];
 440     char dst[1024];
 441     int src_len;
 442
 443     assert (path);
 444     strcpy (src, path);
 445     src_len = strlen (src);
 446
 447     stat (src, &sbuf);
 448     if (S_ISREG(sbuf.st_mode))
 449     {
 450         struct dirs_entry *e_dst;
 451         di = dirs_fopen (dict, src);
 452
 453         e_dst = dirs_read (di);
 454         if (e_dst)
 455         {
 456             if (sbuf.st_mtime > e_dst->mtime)
 457                 if (fileExtract (zh, &e_dst->sysno, src, rGroup, 0))
 458                     dirs_add (di, src, e_dst->sysno, sbuf.st_mtime);
 459         }
 460         else
 461         {
 462             SYSNO sysno = 0;
 463             if (fileExtract (zh, &sysno, src, rGroup, 0))
 464                  dirs_add (di, src, sysno, sbuf.st_mtime);
 465         }
 466         dirs_free (&di);
 467     }
 468     else if (S_ISDIR(sbuf.st_mode))
 469     {
 470         if (src_len && src[src_len-1] != '/')
 471         {
 472             src[src_len] = '/';
 473             src[++src_len] = '\0';
 474         }
 475         di = dirs_open (dict, src, rGroup->flagRw);
 476         *dst = '\0';
 477         fileUpdateR (zh, di, dirs_read (di), src, dst, rGroup, 0);
 478         dirs_free (&di);
 479     }
 480     else
 481     {
 482         logf (LOG_WARN, "Ignoring path %s", src);
 483     }
 484 }
 485
 486
 487 static void repositoryExtract (ZebraHandle zh,
 488                                int deleteFlag, struct recordGroup *rGroup,
 489                                const char *path)
 490 {
 491     struct stat sbuf;
 492     char src[1024];
 493
 494     assert (path);
 495     strcpy (src, path);
 496
 497     stat (src, &sbuf);
 498     if (S_ISREG(sbuf.st_mode))
 499         fileExtract (zh, NULL, src, rGroup, deleteFlag);
 500     else if (S_ISDIR(sbuf.st_mode))
 501         repositoryExtractR (zh, deleteFlag, src, rGroup, 0);
 502     else
 503         logf (LOG_WARN, "Ignoring path %s", src);
 504 }
 505
 506 static void repositoryExtractG (ZebraHandle zh,
 507                                 int deleteFlag, struct recordGroup *rGroup)
 508 {
 509     if (*rGroup->path == '\0' || !strcmp(rGroup->path, "-"))
 510     {
 511         char src[1024];
 512
 513         while (scanf ("%s", src) == 1)
 514             repositoryExtract (zh, deleteFlag, rGroup, src);
 515     }
 516     else
 517         repositoryExtract (zh, deleteFlag, rGroup, rGroup->path);
 518 }
 519
 520 void repositoryUpdate (ZebraHandle zh)
 521 {
 522     struct recordGroup *rGroup = &zh->rGroup;
 523     groupRes (zh, rGroup);
 524     assert (rGroup->path);
 525     if (rGroup->recordId && !strcmp (rGroup->recordId, "file"))
 526     {
 527         Dict dict;
 528         if (!(dict = dict_open (zh->reg->bfs, FMATCH_DICT, 50,
 529                                 rGroup->flagRw, 0)))
 530         {
 531             logf (LOG_FATAL, "dict_open fail of %s", FMATCH_DICT);
 532             return ;
 533         }
 534         if (*rGroup->path == '\0' || !strcmp(rGroup->path, "-"))
 535         {
 536             char src[1024];
 537             while (scanf ("%s", src) == 1)
 538                 fileUpdate (zh, dict, rGroup, src);
 539         }
 540         else
 541             fileUpdate (zh, dict, rGroup, rGroup->path);
 542         dict_close (dict);
 543     }
 544     else
 545         repositoryExtractG (zh, 0, rGroup);
 546 }
 547
 548 void repositoryDelete (ZebraHandle zh)
 549 {
 550     repositoryExtractG (zh, 1, &zh->rGroup);
 551 }
 552