catapult/third_party/mapreduce/mapreduce/mapper_pipeline.py - platform/external/chromium-trace - Git at Google

 #!/usr/bin/env python
 # Copyright 2011 Google Inc. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """Pipelines for mapreduce library."""


 # Public surface of this module: only the mapper pipeline class is exported.
 __all__ = ["MapperPipeline"]


 from mapreduce import control
 from mapreduce import model
 from mapreduce import parameters
 from mapreduce import pipeline_base

 # pylint: disable=g-bad-name
 # pylint: disable=protected-access


 class MapperPipeline(pipeline_base._OutputSlotsMixin,
                      pipeline_base.PipelineBase):
   """Pipeline wrapper for mapper job.

   Args:
     job_name: mapper job name as string
     handler_spec: mapper handler specification as string.
     input_reader_spec: input reader specification as string.
     output_writer_spec: output writer specification as string.
     params: mapper parameters for input reader and output writer as dict.
     shards: number of shards in the job as int.

   Returns:
     default: the list of filenames produced by the mapper if there was any
       output and the map was completed successfully.
     result_status: one of model.MapreduceState._RESULTS.
     job_id: mr id that can be used to query model.MapreduceState. Available
       immediately after run returns.
   """
   async = True

   # TODO(user): we probably want to output counters too.
   # Might also need to double filenames as named output.
   output_names = [
       # Job ID. MapreduceState.get_by_job_id can be used to load
       # mapreduce state.
       "job_id",
       # Dictionary of final counter values. Filled when job is completed.
       "counters"] + pipeline_base._OutputSlotsMixin.output_names

   def run(self,
           job_name,
           handler_spec,
           input_reader_spec,
           output_writer_spec=None,
           params=None,
           shards=None):
     """Start a mapreduce job.

     Args:
       job_name: mapreduce name. Only for display purpose.
       handler_spec: fully qualified name to your map function/class.
       input_reader_spec: fully qualified name to input reader class.
       output_writer_spec: fully qualified name to output writer class.
       params: a dictionary of parameters for input reader and output writer
         initialization.
       shards: number of shards. This provides a guide to mapreduce. The real
         number of shards is determined by how input are splited.
     """
     if shards is None:
       shards = parameters.config.SHARD_COUNT

     mapreduce_id = control.start_map(
         job_name,
         handler_spec,
         input_reader_spec,
         params or {},
         mapreduce_parameters={
             "done_callback": self.get_callback_url(),
             "done_callback_method": "GET",
             "pipeline_id": self.pipeline_id,
         },
         shard_count=shards,
         output_writer_spec=output_writer_spec,
         queue_name=self.queue_name,
         )
     self.fill(self.outputs.job_id, mapreduce_id)
     self.set_status(console_url="%s/detail?mapreduce_id=%s" % (
         (parameters.config.BASE_PATH, mapreduce_id)))

   def try_cancel(self):
     """Always allow mappers to be canceled and retried."""
     return True

   def callback(self):
     """Callback after this async pipeline finishes."""
     if self.was_aborted:
       return

     mapreduce_id = self.outputs.job_id.value
     mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
     if mapreduce_state.result_status != model.MapreduceState.RESULT_SUCCESS:
       self.retry("Job %s had status %s" % (
           mapreduce_id, mapreduce_state.result_status))
       return

     mapper_spec = mapreduce_state.mapreduce_spec.mapper
     outputs = []
     output_writer_class = mapper_spec.output_writer_class()
     if (output_writer_class and
         mapreduce_state.result_status == model.MapreduceState.RESULT_SUCCESS):
       outputs = output_writer_class.get_filenames(mapreduce_state)

     self.fill(self.outputs.result_status, mapreduce_state.result_status)
     self.fill(self.outputs.counters, mapreduce_state.counters_map.to_dict())
     self.complete(outputs)
	#!/usr/bin/env python
	# Copyright 2011 Google Inc. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Pipelines for mapreduce library."""



	# Public surface of this module: only the mapper pipeline class is exported.
	__all__ = ["MapperPipeline"]


	from mapreduce import control
	from mapreduce import model
	from mapreduce import parameters
	from mapreduce import pipeline_base

	# pylint: disable=g-bad-name
	# pylint: disable=protected-access


	class MapperPipeline(pipeline_base._OutputSlotsMixin,
	pipeline_base.PipelineBase):
	"""Pipeline wrapper for mapper job.

	Args:
	job_name: mapper job name as string
	handler_spec: mapper handler specification as string.
	input_reader_spec: input reader specification as string.
	output_writer_spec: output writer specification as string.
	params: mapper parameters for input reader and output writer as dict.
	shards: number of shards in the job as int.

	Returns:
	default: the list of filenames produced by the mapper if there was any
	output and the map was completed successfully.
	result_status: one of model.MapreduceState._RESULTS.
	job_id: mr id that can be used to query model.MapreduceState. Available
	immediately after run returns.
	"""
	async = True

	# TODO(user): we probably want to output counters too.
	# Might also need to double filenames as named output.
	output_names = [
	# Job ID. MapreduceState.get_by_job_id can be used to load
	# mapreduce state.
	"job_id",
	# Dictionary of final counter values. Filled when job is completed.
	"counters"] + pipeline_base._OutputSlotsMixin.output_names

	def run(self,
	job_name,
	handler_spec,
	input_reader_spec,
	output_writer_spec=None,
	params=None,
	shards=None):
	"""Start a mapreduce job.

	Args:
	job_name: mapreduce name. Only for display purpose.
	handler_spec: fully qualified name to your map function/class.
	input_reader_spec: fully qualified name to input reader class.
	output_writer_spec: fully qualified name to output writer class.
	params: a dictionary of parameters for input reader and output writer
	initialization.
	shards: number of shards. This provides a guide to mapreduce. The real
	number of shards is determined by how input are splited.
	"""
	if shards is None:
	shards = parameters.config.SHARD_COUNT

	mapreduce_id = control.start_map(
	job_name,
	handler_spec,
	input_reader_spec,
	params or {},
	mapreduce_parameters={
	"done_callback": self.get_callback_url(),
	"done_callback_method": "GET",
	"pipeline_id": self.pipeline_id,
	},
	shard_count=shards,
	output_writer_spec=output_writer_spec,
	queue_name=self.queue_name,
	)
	self.fill(self.outputs.job_id, mapreduce_id)
	self.set_status(console_url="%s/detail?mapreduce_id=%s" % (
	(parameters.config.BASE_PATH, mapreduce_id)))

	def try_cancel(self):
	"""Always allow mappers to be canceled and retried."""
	return True

	def callback(self):
	"""Callback after this async pipeline finishes."""
	if self.was_aborted:
	return

	mapreduce_id = self.outputs.job_id.value
	mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
	if mapreduce_state.result_status != model.MapreduceState.RESULT_SUCCESS:
	self.retry("Job %s had status %s" % (
	mapreduce_id, mapreduce_state.result_status))
	return

	mapper_spec = mapreduce_state.mapreduce_spec.mapper
	outputs = []
	output_writer_class = mapper_spec.output_writer_class()
	if (output_writer_class and
	mapreduce_state.result_status == model.MapreduceState.RESULT_SUCCESS):
	outputs = output_writer_class.get_filenames(mapreduce_state)

	self.fill(self.outputs.result_status, mapreduce_state.result_status)
	self.fill(self.outputs.counters, mapreduce_state.counters_map.to_dict())
	self.complete(outputs)