blob: 3a54c3ddbcef3403a90e8f589ed8bcee0c0eb2c6 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/history/visitsegment_database.h"
#include <math.h>
#include <algorithm>
#include <string>
#include <vector>
#include "base/logging.h"
#include "base/stl_util.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/history/core/browser/page_usage_data.h"
#include "sql/statement.h"
#include "sql/transaction.h"
// The following tables are used to store url segment information.
//
// segments
// id Primary key
// name A unique string to represent that segment. (URL derived)
// url_id ID of the url currently used to represent this segment.
//
// segment_usage
// id Primary key
// segment_id Corresponding segment id
// time_slot time stamp identifying for what day this entry is about
// visit_count Number of visit in the segment
//
namespace history {
VisitSegmentDatabase::VisitSegmentDatabase() {
}
VisitSegmentDatabase::~VisitSegmentDatabase() {
}
bool VisitSegmentDatabase::InitSegmentTables() {
// Segments table.
if (!GetDB().DoesTableExist("segments")) {
if (!GetDB().Execute("CREATE TABLE segments ("
"id INTEGER PRIMARY KEY,"
"name VARCHAR,"
"url_id INTEGER NON NULL)")) {
return false;
}
if (!GetDB().Execute(
"CREATE INDEX segments_name ON segments(name)")) {
return false;
}
}
// This was added later, so we need to try to create it even if the table
// already exists.
if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON "
"segments(url_id)"))
return false;
// Segment usage table.
if (!GetDB().DoesTableExist("segment_usage")) {
if (!GetDB().Execute("CREATE TABLE segment_usage ("
"id INTEGER PRIMARY KEY,"
"segment_id INTEGER NOT NULL,"
"time_slot INTEGER NOT NULL,"
"visit_count INTEGER DEFAULT 0 NOT NULL)")) {
return false;
}
if (!GetDB().Execute(
"CREATE INDEX segment_usage_time_slot_segment_id ON "
"segment_usage(time_slot, segment_id)")) {
return false;
}
}
// Added in a later version, so we always need to try to creat this index.
if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id "
"ON segment_usage(segment_id)"))
return false;
return true;
}
bool VisitSegmentDatabase::DropSegmentTables() {
// Dropping the tables will implicitly delete the indices.
return GetDB().Execute("DROP TABLE segments") &&
GetDB().Execute("DROP TABLE segment_usage");
}
// Note: the segment name is derived from the URL but is not a URL. It is
// a string that can be easily recreated from various URLS. Maybe this should
// be an MD5 to limit the length.
//
// static
std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
// TODO(brettw) this should probably use the registry controlled
// domains service.
GURL::Replacements r;
const char kWWWDot[] = "www.";
const int kWWWDotLen = arraysize(kWWWDot) - 1;
std::string host = url.host();
const char* host_c = host.c_str();
// Remove www. to avoid some dups.
if (static_cast<int>(host.size()) > kWWWDotLen &&
LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
r.SetHost(host.c_str(),
url::Component(kWWWDotLen,
static_cast<int>(host.size()) - kWWWDotLen));
}
// Remove other stuff we don't want.
r.ClearUsername();
r.ClearPassword();
r.ClearQuery();
r.ClearRef();
r.ClearPort();
return url.ReplaceComponents(r).spec();
}
SegmentID VisitSegmentDatabase::GetSegmentNamed(
const std::string& segment_name) {
sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
"SELECT id FROM segments WHERE name = ?"));
statement.BindString(0, segment_name);
if (statement.Step())
return statement.ColumnInt64(0);
return 0;
}
bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
URLID url_id) {
sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
"UPDATE segments SET url_id = ? WHERE id = ?"));
statement.BindInt64(0, url_id);
statement.BindInt64(1, segment_id);
return statement.Run();
}
URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
"SELECT url_id FROM segments WHERE id = ?"));
statement.BindInt64(0, segment_id);
if (statement.Step())
return statement.ColumnInt64(0);
return 0;
}
SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
const std::string& segment_name) {
sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
"INSERT INTO segments (name, url_id) VALUES (?,?)"));
statement.BindString(0, segment_name);
statement.BindInt64(1, url_id);
if (statement.Run())
return GetDB().GetLastInsertRowId();
return 0;
}
bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
base::Time ts,
int amount) {
base::Time t = ts.LocalMidnight();
sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
"SELECT id, visit_count FROM segment_usage "
"WHERE time_slot = ? AND segment_id = ?"));
select.BindInt64(0, t.ToInternalValue());
select.BindInt64(1, segment_id);
if (!select.is_valid())
return false;
if (select.Step()) {
sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
"UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
update.BindInt64(1, select.ColumnInt64(0));
return update.Run();
} else {
sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
"INSERT INTO segment_usage "
"(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
insert.BindInt64(0, segment_id);
insert.BindInt64(1, t.ToInternalValue());
insert.BindInt64(2, static_cast<int64>(amount));
return insert.Run();
}
}
void VisitSegmentDatabase::QuerySegmentUsage(
base::Time from_time,
int max_result_count,
std::vector<PageUsageData*>* results) {
// This function gathers the highest-ranked segments in two queries.
// The first gathers scores for all segments.
// The second gathers segment data (url, title, etc.) for the highest-ranked
// segments.
// Gather all the segment scores.
sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
"SELECT segment_id, time_slot, visit_count "
"FROM segment_usage WHERE time_slot >= ? "
"ORDER BY segment_id"));
if (!statement.is_valid())
return;
base::Time ts = from_time.LocalMidnight();
statement.BindInt64(0, ts.ToInternalValue());
base::Time now = base::Time::Now();
SegmentID last_segment_id = 0;
PageUsageData* pud = NULL;
float score = 0;
while (statement.Step()) {
SegmentID segment_id = statement.ColumnInt64(0);
if (segment_id != last_segment_id) {
if (pud) {
pud->SetScore(score);
results->push_back(pud);
}
pud = new PageUsageData(segment_id);
score = 0;
last_segment_id = segment_id;
}
base::Time timeslot =
base::Time::FromInternalValue(statement.ColumnInt64(1));
int visit_count = statement.ColumnInt(2);
int days_ago = (now - timeslot).InDays();
// Score for this day in isolation.
float day_visits_score = 1.0f + log(static_cast<float>(visit_count));
// Recent visits count more than historical ones, so we multiply in a boost
// related to how long ago this day was.
// This boost is a curve that smoothly goes through these values:
// Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
// at the limit of how far we reach into the past.
float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
score += recency_boost * day_visits_score;
}
if (pud) {
pud->SetScore(score);
results->push_back(pud);
}
// Limit to the top kResultCount results.
std::sort(results->begin(), results->end(), PageUsageData::Predicate);
if (static_cast<int>(results->size()) > max_result_count) {
STLDeleteContainerPointers(results->begin() + max_result_count,
results->end());
results->resize(max_result_count);
}
// Now fetch the details about the entries we care about.
sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
"SELECT urls.url, urls.title FROM urls "
"JOIN segments ON segments.url_id = urls.id "
"WHERE segments.id = ?"));
if (!statement2.is_valid())
return;
for (size_t i = 0; i < results->size(); ++i) {
PageUsageData* pud = (*results)[i];
statement2.BindInt64(0, pud->GetID());
if (statement2.Step()) {
pud->SetURL(GURL(statement2.ColumnString(0)));
pud->SetTitle(statement2.ColumnString16(1));
}
statement2.Reset(true);
}
}
bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
"DELETE FROM segment_usage WHERE time_slot < ?"));
statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue());
return statement.Run();
}
bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
"DELETE FROM segment_usage WHERE segment_id IN "
"(SELECT id FROM segments WHERE url_id = ?)"));
delete_usage.BindInt64(0, url_id);
if (!delete_usage.Run())
return false;
sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
"DELETE FROM segments WHERE url_id = ?"));
delete_seg.BindInt64(0, url_id);
return delete_seg.Run();
}
bool VisitSegmentDatabase::MigratePresentationIndex() {
sql::Transaction transaction(&GetDB());
return transaction.Begin() &&
GetDB().Execute("DROP TABLE presentation") &&
GetDB().Execute("CREATE TABLE segments_tmp ("
"id INTEGER PRIMARY KEY,"
"name VARCHAR,"
"url_id INTEGER NON NULL)") &&
GetDB().Execute("INSERT INTO segments_tmp SELECT "
"id, name, url_id FROM segments") &&
GetDB().Execute("DROP TABLE segments") &&
GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") &&
transaction.Commit();
}
} // namespace history