blob: 0e3a99d07fb11b644a19cc04a2349ba6e0c3cbdf [file] [log] [blame]
/****************************************************************************
**
** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the Qt Assistant of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** GNU Lesser General Public License Usage
** This file may be used under the terms of the GNU Lesser General Public
** License version 2.1 as published by the Free Software Foundation and
** appearing in the file LICENSE.LGPL included in the packaging of this
** file. Please review the following information to ensure the GNU Lesser
** General Public License version 2.1 requirements will be met:
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU General
** Public License version 3.0 as published by the Free Software Foundation
** and appearing in the file LICENSE.GPL included in the packaging of this
** file. Please review the following information to ensure the GNU General
** Public License version 3.0 requirements will be met:
** http://www.gnu.org/copyleft/gpl.html.
**
** Other Usage
** Alternatively, this file may be used in accordance with the terms and
** conditions contained in a signed written agreement between you and Nokia.
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include "qhelpenginecore.h"
#include "qhelpsearchindexreader_default_p.h"
#include <QtCore/QDir>
#include <QtCore/QUrl>
#include <QtCore/QFile>
#include <QtCore/QVariant>
#include <QtCore/QFileInfo>
#include <QtCore/QDataStream>
#include <QtCore/QTextStream>
QT_BEGIN_NAMESPACE
namespace fulltextsearch {
namespace std {
namespace {
    // Breaks a wildcard search term into its literal pieces and "*" markers.
    // A leading '*' yields a leading "*" token; every literal piece that is
    // followed by a '*' is emitted together with one "*" token, so runs of
    // consecutive stars collapse. Whatever follows the last '*' is appended
    // as the final token when it is not empty.
    QStringList split( const QString &str )
    {
        const QLatin1Char star('*');
        const QString starToken = QLatin1String("*");

        QStringList tokens;
        if (str.startsWith(starToken))
            tokens << starToken;

        int start = 0;
        for (int pos = str.indexOf(star, start); pos != -1;
            pos = str.indexOf(star, start)) {
            if (pos > start) {
                tokens << str.mid(start, pos - start);
                tokens << starToken;
            }
            start = pos + 1;
        }

        // Remainder after the last star (or the whole string without stars).
        const QString tail = str.mid(start);
        if (!tail.isEmpty())
            tokens << tail;

        return tokens;
    }
}
// Creates a reader with no index path, no selected index/document file and
// empty lookup tables.
Reader::Reader()
    : indexPath()
    , indexFile()
    , documentFile()
{
    searchIndexTable.clear();
    indexTable.clear();
    termList.clear();
}
// Releases the index entries owned by the reader, the filtered search table
// and the positional mini index.
Reader::~Reader()
{
    reset();
    searchIndexTable.clear();

    // searchForPattern() frees the PosEntry objects only at the start of its
    // next invocation, so the ones created by the last call are still alive
    // here and have to be released explicitly.
    for (QHash<QString, PosEntry*>::ConstIterator it = miniIndex.constBegin();
        it != miniIndex.constEnd(); ++it) {
        delete it.value();
    }
    miniIndex.clear();
}
// Loads the currently selected index file / document file pair (see
// setIndexFile()) into indexTable, keyed by the index file name.
//
// Returns true when the pair is already loaded or was read successfully.
// Returns false when either file cannot be opened or yields no data; in that
// case nothing is added to indexTable and all entries allocated while
// reading are released again.
bool Reader::readIndex()
{
    // Already cached from an earlier call.
    if (indexTable.contains(indexFile))
        return true;

    QFile idxFile(indexFile);
    if (!idxFile.open(QFile::ReadOnly))
        return false;

    QString key;
    int numOfDocs = 0;
    EntryTable entryTable;
    QVector<Document> docs;
    QDataStream dictStream(&idxFile);
    // Each record is: term, number of documents, document vector.
    while (!dictStream.atEnd()) {
        dictStream >> key;
        dictStream >> numOfDocs;
        docs.resize(numOfDocs);
        dictStream >> docs;
        entryTable.insert(key, new Entry(docs));
    }
    idxFile.close();

    if (entryTable.isEmpty())
        return false;

    QFile docFile(documentFile);
    if (!docFile.open(QFile::ReadOnly)) {
        // The entries were heap allocated above; free them before bailing
        // out, otherwise they would leak.
        cleanupIndex(entryTable);
        return false;
    }

    QString title, url;
    DocumentList documentList;
    QDataStream docStream(&docFile);
    // Each record is: document title, document url.
    while (!docStream.atEnd()) {
        docStream >> title;
        docStream >> url;
        documentList.append(QStringList(title) << url);
    }
    docFile.close();

    if (documentList.isEmpty()) {
        cleanupIndex(entryTable);
        return false;
    }

    indexTable.insert(indexFile, Index(entryTable, documentList));
    return true;
}
// Tells whether filterFilesForAttributes() left at least one index to search.
bool Reader::initCheck() const
{
    const bool nothingToSearch = searchIndexTable.isEmpty();
    return !nothingToSearch;
}
void Reader::setIndexPath(const QString &path)
{
indexPath = path;
}
void Reader::filterFilesForAttributes(const QStringList &attributes)
{
searchIndexTable.clear();
for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) {
const QString fileName = it.key();
bool containsAll = true;
QStringList split = fileName.split(QLatin1String("@"));
foreach (const QString &attribute, attributes) {
if (!split.contains(attribute, Qt::CaseInsensitive)) {
containsAll = false;
break;
}
}
if (containsAll)
searchIndexTable.insert(fileName, it.value());
}
}
void Reader::setIndexFile(const QString &namespaceName, const QString &attributes)
{
QString extension = namespaceName + QLatin1String("@") + attributes;
indexFile = indexPath + QLatin1String("/indexdb40.") + extension;
documentFile = indexPath + QLatin1String("/indexdoc40.") + extension;
}
bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms,
QStringList *termSeq, QStringList *seqWords)
{
QString term = searchTerm;
term = term.simplified();
term = term.replace(QLatin1String("\'"), QLatin1String("\""));
term = term.replace(QLatin1String("`"), QLatin1String("\""));
term = term.replace(QLatin1String("-"), QLatin1String(" "));
term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" "));
*terms = term.split(QLatin1Char(' '));
QStringList::iterator it = terms->begin();
for (; it != terms->end(); ++it) {
(*it) = (*it).simplified();
(*it) = (*it).toLower();
(*it) = (*it).replace(QLatin1String("\""), QLatin1String(""));
}
if (term.contains(QLatin1Char('\"'))) {
if ((term.count(QLatin1Char('\"')))%2 == 0) {
int beg = 0;
int end = 0;
QString s;
beg = term.indexOf(QLatin1Char('\"'), beg);
while (beg != -1) {
beg++;
end = term.indexOf(QLatin1Char('\"'), beg);
s = term.mid(beg, end - beg);
s = s.toLower();
s = s.simplified();
if (s.contains(QLatin1Char('*'))) {
qWarning("Full Text Search, using a wildcard within phrases is not allowed.");
return false;
}
*seqWords += s.split(QLatin1Char(' '));
*termSeq << s;
beg = term.indexOf(QLatin1Char('\"'), end + 1);
}
} else {
qWarning("Full Text Search, the closing quotation mark is missing.");
return false;
}
}
return true;
}
// Looks up every term in all index files selected by
// filterFilesForAttributes() and stores, per term, the matching documents in
// termList (sorted afterwards). Terms containing '*' are expanded through
// getWildcardTerms()/setupDummyTerm(). Call hits() afterwards to obtain the
// combined result.
void Reader::searchInIndex(const QStringList &terms)
{
    // hits() removes only the first element of termList, so without this
    // the terms of an earlier search would still be in the list and would be
    // intersected with the results of the current one.
    termList.clear();

    foreach (const QString &term, terms) {
        QVector<Document> documents;

        for (IndexTable::ConstIterator it = searchIndexTable.begin();
            it != searchIndexTable.end(); ++it) {
            const EntryTable &entryTable = it.value().first;
            const DocumentList &documentList = it.value().second;

            if (term.contains(QLatin1Char('*'))) {
                documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable);
            } else if (Entry *entry = entryTable.value(term)) {
                documents = entry->documents;
            } else {
                // Term unknown in this index file.
                continue;
            }

            if (documents.isEmpty())
                continue;

            // Resolve the document numbers to title/url pairs.
            DocumentInfo info;
            QVector<DocumentInfo> documentsInfo;
            foreach (const Document &doc, documents) {
                info.docNumber = doc.docNumber;
                info.frequency = doc.frequency;
                info.documentUrl = documentList.at(doc.docNumber).at(1);
                info.documentTitle = documentList.at(doc.docNumber).at(0);
                documentsInfo.append(info);
            }

            // Merge with the results the same term produced in another
            // index file, or start a new record for it.
            bool found = false;
            for (QList<TermInfo>::Iterator tit = termList.begin();
                tit != termList.end(); ++tit) {
                TermInfo *t = &(*tit);
                if (t->term == term) {
                    t->documents += documentsInfo;
                    t->frequency += documentsInfo.count();
                    found = true;
                    break;
                }
            }
            if (!found)
                termList.append(TermInfo(term, documentsInfo.count(), documentsInfo));
        }
    }
    qSort(termList);
}
// Combines the per-term results collected by searchInIndex().
// The first entry of termList is removed and used as the starting set; for
// every remaining term, documents missing from that term are dropped and the
// frequencies of the surviving documents are summed up. The result is
// returned sorted.
QVector<DocumentInfo> Reader::hits()
{
    if (termList.isEmpty())
        return QVector<DocumentInfo>();

    QVector<DocumentInfo> result = termList.takeFirst().documents;

    foreach (const TermInfo &other, termList) {
        const QVector<DocumentInfo> candidates = other.documents;

        QVector<DocumentInfo>::Iterator current = result.begin();
        while (current != result.end()) {
            // Position of the first candidate describing the same document.
            int match = -1;
            for (int k = 0; k < candidates.size(); ++k) {
                if (candidates.at(k).docNumber == current->docNumber) {
                    match = k;
                    break;
                }
            }

            if (match < 0) {
                current = result.erase(current);
            } else {
                current->frequency += candidates.at(match).frequency;
                ++current;
            }
        }
    }

    qSort(result);
    return result;
}
bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words,
const QByteArray &data)
{
if (data.isEmpty())
return false;
for(QHash<QString, PosEntry*>::ConstIterator mit =
miniIndex.begin(); mit != miniIndex.end(); ++mit) {
delete mit.value();
}
miniIndex.clear();
wordNum = 3;
QStringList::ConstIterator cIt = words.begin();
for ( ; cIt != words.end(); ++cIt )
miniIndex.insert(*cIt, new PosEntry(0));
QTextStream s(data);
QString text = s.readAll();
bool valid = true;
const QChar *buf = text.unicode();
QChar str[64];
QChar c = buf[0];
int j = 0;
int i = 0;
while ( j < text.length() ) {
if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
valid = false;
if ( i > 1 )
buildMiniIndex( QString(str,i) );
i = 0;
c = buf[++j];
continue;
}
if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
valid = true;
c = buf[++j];
continue;
}
if ( !valid ) {
c = buf[++j];
continue;
}
if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
str[i] = c.toLower();
++i;
} else {
if ( i > 1 )
buildMiniIndex( QString(str,i) );
i = 0;
}
c = buf[++j];
}
if ( i > 1 )
buildMiniIndex( QString(str,i) );
QStringList::ConstIterator patIt = patterns.begin();
QStringList wordLst;
QList<uint> a, b;
QList<uint>::iterator aIt;
for ( ; patIt != patterns.end(); ++patIt ) {
wordLst = (*patIt).split(QLatin1Char(' '));
a = miniIndex[ wordLst[0] ]->positions;
for ( int j = 1; j < (int)wordLst.count(); ++j ) {
b = miniIndex[ wordLst[j] ]->positions;
aIt = a.begin();
while ( aIt != a.end() ) {
if ( b.contains( *aIt + 1 )) {
(*aIt)++;
++aIt;
} else {
aIt = a.erase( aIt );
}
}
}
}
if ( a.count() )
return true;
return false;
}
// Builds the document list for a wildcard search: the union of the documents
// of all given terms that exist in entryTable. The documents of the term
// that sorts last come first; documents of the other terms are appended when
// not yet present. Returns an empty vector if none of the terms is known.
QVector<Document> Reader::setupDummyTerm(const QStringList &terms,
    const EntryTable &entryTable)
{
    QList<Term> knownTerms;
    foreach (const QString &word, terms) {
        Entry *entry = entryTable.value(word);
        if (entry)
            knownTerms.append(Term(word, entry->documents.count(), entry->documents));
    }

    if (knownTerms.isEmpty())
        return QVector<Document>(0);

    qSort(knownTerms);

    QVector<Document> merged = knownTerms.takeLast().documents;
    foreach (const Term &other, knownTerms) {
        foreach (const Document &doc, other.documents) {
            if (!merged.contains(doc))
                merged.append(doc);
        }
    }
    return merged;
}
// Returns every key of entryTable that matches the wildcard expression term.
// The expression is broken into literal pieces and "*" tokens by split();
// each key is then tested piece by piece, in order.
QStringList Reader::getWildcardTerms(const QString &term,
const EntryTable &entryTable)
{
QStringList lst;
QStringList terms = split(term);
QStringList::Iterator iter;
for(EntryTable::ConstIterator it = entryTable.begin();
it != entryTable.end(); ++it) {
// index: position in the key from which the next literal piece is searched
int index = 0;
bool found = false;
QString text( it.key() );
for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
// A "*" token matches anything; nothing to verify for it.
if ( *iter == QLatin1String("*") ) {
found = true;
continue;
}
// A literal first piece must start with the same character as the key.
if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
found = false;
break;
}
index = text.indexOf( *iter, index );
// A piece equal to the last token (compared by value) has to sit at the
// very end of the key: re-anchor on its last occurrence and reject the key
// when that occurrence is not a suffix.
if ( *iter == terms.last() && index != (int)text.length()-1 ) {
index = text.lastIndexOf( *iter );
if ( index != (int)text.length() - (int)(*iter).length() ) {
found = false;
break;
}
}
// Piece located: continue searching behind it; otherwise the key fails.
if ( index != -1 ) {
found = true;
index += (*iter).length();
continue;
} else {
found = false;
break;
}
}
if (found)
lst << text;
}
return lst;
}
void Reader::buildMiniIndex(const QString &string)
{
if (miniIndex[string])
miniIndex[string]->positions.append(wordNum);
++wordNum;
}
void Reader::reset()
{
for(IndexTable::Iterator it = indexTable.begin();
it != indexTable.end(); ++it) {
cleanupIndex(it.value().first);
it.value().second.clear();
}
}
// Deletes every Entry referenced by entryTable and empties the table.
void Reader::cleanupIndex(EntryTable &entryTable)
{
    EntryTable::ConstIterator current = entryTable.begin();
    while (current != entryTable.end()) {
        delete current.value();
        ++current;
    }
    entryTable.clear();
}
// Constructs the default index reader; all setup is done by the base class.
QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault()
    : QHelpSearchIndexReader()
{
}
// Nothing to release here; the members clean up after themselves.
QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault()
{
}
void QHelpSearchIndexReaderDefault::run()
{
mutex.lock();
if (m_cancel) {
mutex.unlock();
return;
}
const QList<QHelpSearchQuery> &queryList = this->m_query;
const QLatin1String key("DefaultSearchNamespaces");
const QString collectionFile(this->m_collectionFile);
const QString indexPath = m_indexFilesFolder;
mutex.unlock();
QString queryTerm;
foreach (const QHelpSearchQuery &query, queryList) {
if (query.fieldName == QHelpSearchQuery::DEFAULT) {
queryTerm = query.wordList.at(0);
break;
}
}
if (queryTerm.isEmpty())
return;
QHelpEngineCore engine(collectionFile, 0);
if (!engine.setupData())
return;
const QStringList registeredDocs = engine.registeredDocumentations();
const QStringList indexedNamespaces = engine.customValue(key).toString().
split(QLatin1String("|"), QString::SkipEmptyParts);
emit searchingStarted();
// setup the reader
m_reader.setIndexPath(indexPath);
foreach(const QString &namespaceName, registeredDocs) {
mutex.lock();
if (m_cancel) {
mutex.unlock();
searchingFinished(0); // TODO: check this ???
return;
}
mutex.unlock();
const QList<QStringList> attributeSets =
engine.filterAttributeSets(namespaceName);
foreach (const QStringList &attributes, attributeSets) {
// read all index files
m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
if (!m_reader.readIndex()) {
qWarning("Full Text Search, could not read file for namespace: %s.",
namespaceName.toUtf8().constData());
}
}
}
// get the current filter attributes and minimize the index files table
m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter()));
hitList.clear();
QStringList terms, termSeq, seqWords;
if (m_reader.initCheck() && // check if we could read anything
m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) {
// search for term(s)
m_reader.searchInIndex(terms); // TODO: should this be interruptible as well ???
QVector<DocumentInfo> hits = m_reader.hits();
if (!hits.isEmpty()) {
if (termSeq.isEmpty()) {
foreach (const DocumentInfo &docInfo, hits) {
mutex.lock();
if (m_cancel) {
mutex.unlock();
searchingFinished(0); // TODO: check this, speed issue while locking???
return;
}
mutex.unlock();
hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
}
} else {
foreach (const DocumentInfo &docInfo, hits) {
mutex.lock();
if (m_cancel) {
mutex.unlock();
searchingFinished(0); // TODO: check this, speed issue while locking???
return;
}
mutex.unlock();
if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ???
hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
}
}
}
}
emit searchingFinished(hitList.count());
}
} // namespace std
} // namespace fulltextsearch
QT_END_NAMESPACE