added tutorial-style warning messages for dictBuilder (dictionary much smaller than requested; too few sample files)
diff --git a/.gitignore b/.gitignore
index e7c9a56..0c45815 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,7 @@
# Test artefacts
tmp*
+dictionary
# tmp files
*.swp
diff --git a/NEWS b/NEWS
index 7ffa402..d01a331 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,6 @@
v0.8.0
New : updated compresson format
+Improved : better speed on clang and gcc -O2, thanks to Eric Biggers
Fixed : legacy mode with ZSTD_HEAPMODE=0, by Christopher Bergqvist
Fixed : premature end of frame when zero-sized raw block, reported by Eric Biggers
Fixed : checksum correctly checked in single-pass mode
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index f151855..75a9b1e 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -924,7 +924,7 @@
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t params)
{
- U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
+ U32 const dictListSize = MAX(MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
unsigned selectivity = params.selectivityLevel;
size_t const targetDictSize = maxDictSize;
@@ -957,17 +957,25 @@
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
DISPLAYLEVEL(3, "list %u best segments \n", nb);
for (u=1; u<=nb; u++) {
- U32 p = dictList[u].pos;
- U32 l = dictList[u].length;
- U32 d = MIN(40, l);
+ U32 pos = dictList[u].pos;
+ U32 length = dictList[u].length;
+ U32 printedLength = MIN(40, length);
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
- u, l, p, dictList[u].savings);
- ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
+ u, length, pos, dictList[u].savings);
+ ZDICT_printHex(3, (const char*)samplesBuffer+pos, printedLength);
DISPLAYLEVEL(3, "| \n");
} } }
/* create dictionary */
{ U32 dictContentSize = ZDICT_dictSize(dictList);
+ U64 const totalSamplesSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+ if (dictContentSize < targetDictSize/2) {
+ DISPLAYLEVEL(2, "! warning : created dictionary significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
+ DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
+ DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
+ if (totalSamplesSize < 10 * targetDictSize)
+ DISPLAYLEVEL(2, "! consider also increasing the number of samples (total size : %u MB)\n", (U32)(totalSamplesSize>>20));
+ }
/* build dict content */
{ U32 u;
diff --git a/programs/dibio.c b/programs/dibio.c
index a61ea9c..cb864ec 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -202,9 +202,16 @@
/* Checks */
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
+ g_displayLevel = params.notificationLevel;
+ if (nbFiles < 5) {
+ DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing \n");
+ DISPLAYLEVEL(2, "! Please provide one file per sample \n");
+ DISPLAYLEVEL(2, "! Avoid concatenating multiple samples into a single file \n");
+ DISPLAYLEVEL(2, "! otherwise, dictBuilder will be unable to find the beginning of each sample \n");
+ DISPLAYLEVEL(2, "! resulting in distorted statistics \n");
+ }
/* init */
- g_displayLevel = params.notificationLevel;
if (benchedSize < totalSizeToLoad)
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));