blob: c5a34cc977f4e559ed0d271edd3f76f0731ce62f [file] [log] [blame]
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import csv
import json
import os
from import cloud_storage
from import page as page_module
from import page_set_archive_info
class PageSet(object):
    """A list of pages together with the settings shared by all of them.

    A page set is normally loaded from a JSON file through FromFile(). It
    behaves like a sequence of pages: iteration, len() and indexed get/set
    are all backed by `self.pages`.
    """

    def __init__(self, file_path='', attributes=None):
        """Builds the page set and fetches the files it needs.

        Args:
          file_path: Path of the page set file (or of a directory). Relative
              paths found in the page set are resolved against `_base_dir`.
          attributes: Optional dict of settings. Every key is set as an
              attribute of this object. Two keys get extra handling:
              'pages', a list of dicts that each carry a 'url' key, and
              'serving_dirs', a list of directories relative to `_base_dir`.

        Raises:
          ValueError: If 'serving_dirs' is not a list, or if one of the
              directories to serve is '/'.
        """
        self.file_path = file_path

        # These attributes can be set dynamically by the page set.
        self.description = ''
        self.archive_data_file = ''
        self.credentials_path = None
        self.user_agent_type = None
        self.make_javascript_deterministic = True
        self.navigate_steps = {'action': 'navigate'}

        if attributes:
            # items() instead of iteritems() so this runs on Python 2 and 3.
            for k, v in attributes.items():
                setattr(self, k, v)

        # Create a PageSetArchiveInfo object.
        if self.archive_data_file:
            self.wpr_archive_info = (
                page_set_archive_info.PageSetArchiveInfo.FromFile(
                    os.path.join(self._base_dir, self.archive_data_file),
                    file_path))
        else:
            self.wpr_archive_info = None

        # Create a Page object for every page.
        self.pages = []
        if attributes and 'pages' in attributes:
            for page_attributes in attributes['pages']:
                # Work on a copy: pop() would otherwise strip 'url' from the
                # caller's dict and break a second load of the same data.
                page_attributes = dict(page_attributes)
                url = page_attributes.pop('url')
                page = page_module.Page(
                    url, self, attributes=page_attributes,
                    base_dir=self._base_dir)
                self.pages.append(page)

        # Prepend _base_dir to our serving dirs.
        # Always use realpath to ensure no duplicates in set.
        self.serving_dirs = set()
        if attributes and 'serving_dirs' in attributes:
            if not isinstance(attributes['serving_dirs'], list):
                raise ValueError('serving_dirs must be a list.')
            for serving_dir in attributes['serving_dirs']:
                self.serving_dirs.add(
                    os.path.realpath(
                        os.path.join(self._base_dir, serving_dir)))

        # Attempt to download the credentials file.
        if self.credentials_path:
            # NOTE(review): the first argument of this call was lost from the
            # source; INTERNAL_BUCKET is assumed -- confirm against
            # cloud_storage before landing.
            cloud_storage.GetIfChanged(
                cloud_storage.INTERNAL_BUCKET,
                os.path.join(self._base_dir, self.credentials_path))

        # Scan every serving directory for .sha1 files
        # and download them from Cloud Storage. Assume all data is public.
        all_serving_dirs = self.serving_dirs.copy()
        # Add individual page dirs to all serving dirs.
        for page in self:
            if page.is_file:
                # NOTE(review): assumes Page exposes its directory as
                # `serving_dir` -- confirm against the Page class.
                all_serving_dirs.add(page.serving_dir)
        # Scan all serving dirs.
        for serving_dir in all_serving_dirs:
            if serving_dir == '/':
                raise ValueError('Trying to serve "/" from HTTP server.')
            for dirpath, _, filenames in os.walk(serving_dir):
                for filename in filenames:
                    # `path` is the file name with its extension removed,
                    # i.e. the file that the .sha1 stub stands for.
                    path, extension = os.path.splitext(
                        os.path.join(dirpath, filename))
                    if extension == '.sha1':
                        cloud_storage.GetIfChanged(
                            cloud_storage.PUBLIC_BUCKET, path)

    @classmethod
    def FromFile(cls, file_path):
        """Returns a page set built from the JSON file at `file_path`."""
        with open(file_path, 'r') as f:
            contents = f.read()
        data = json.loads(contents)
        return cls.FromDict(data, file_path)

    @classmethod
    def FromDict(cls, data, file_path):
        """Returns a page set built from the already parsed dict `data`."""
        return cls(file_path, data)

    @property
    def _base_dir(self):
        """Directory that relative paths in the page set are resolved against.

        This is the directory containing `file_path` when that is an
        existing file, and `file_path` itself otherwise.
        """
        if os.path.isfile(self.file_path):
            return os.path.dirname(self.file_path)
        return self.file_path

    def ContainsOnlyFileURLs(self):
        """Returns True if every page is a file page (also when empty)."""
        return all(page.is_file for page in self.pages)

    def ReorderPageSet(self, results_file):
        """Reorders this page set based on the results of a past run.

        Args:
          results_file: Path of a CSV file whose header row has a 'url'
              column.

        Returns:
          A new list with one page per data row, in row order. `self.pages`
          itself is left untouched.

        Raises:
          Exception: If the file is empty, has no 'url' column, or names a
              URL that is not part of this page set.
        """
        page_set_dict = {}
        for page in self.pages:
            page_set_dict[page.url] = page

        # The csv module needs a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if str is bytes:
            csv_file = open(results_file, 'rb')
        else:
            csv_file = open(results_file, 'r', newline='')

        pages = []
        with csv_file:
            csv_reader = csv.reader(csv_file)
            # An empty file yields an empty header, which is rejected below
            # instead of leaking a bare StopIteration.
            csv_header = next(csv_reader, [])

            if 'url' not in csv_header:
                raise Exception('Unusable results_file.')

            url_index = csv_header.index('url')

            for csv_row in csv_reader:
                if csv_row[url_index] in page_set_dict:
                    pages.append(page_set_dict[csv_row[url_index]])
                else:
                    raise Exception('Unusable results_file.')

        return pages

    def WprFilePathForPage(self, page):
        """Returns the archive path for `page`, or None without archive info."""
        if not self.wpr_archive_info:
            return None
        return self.wpr_archive_info.WprFilePathForPage(page)

    def __iter__(self):
        return self.pages.__iter__()

    def __len__(self):
        return len(self.pages)

    def __getitem__(self, key):
        return self.pages[key]

    def __setitem__(self, key, value):
        self.pages[key] = value