| # Copyright 2015-2017 ARM Limited |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| import pandas as pd |
| import numpy as np |
| |
| """Generic functions that can be used in multiple places in trappy |
| """ |
| |
def listify(to_select):
    """Ensure *to_select* is a list.

    A non-list argument is wrapped in a single-element list; a list
    is returned unchanged.  Lets callers accept either a single item
    or a list of items.
    """

    if isinstance(to_select, list):
        return to_select

    return [to_select]
| |
def handle_duplicate_index(data,
                           max_delta=0.000001):
    """Handle duplicate values in index

    :param data: The timeseries input (must have a sorted index)
    :type data: :mod:`pandas.Series`

    :param max_delta: Maximum interval adjustment value that
        will be added to duplicate indices
    :type max_delta: float

    Consider the following case where a series needs to be reindexed
    to a new index (which can be required when different series need to
    be combined and compared):
    ::

        import pandas
        values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        new_index = [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 7.0]
        series.reindex(new_index)

    The above code fails with:
    ::

        ValueError: cannot reindex from a duplicate axis

    The function :func:`handle_duplicate_index` changes the duplicate values
    to
    ::

        >>> import pandas
        >>> from trappy.utils import handle_duplicate_index

        >>> values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        series = handle_duplicate_index(series)
        print series.index.values
        >>> [ 0.        1.        1.000001  6.        7.      ]

    """

    index = data.index
    # Work on a private copy: modern pandas marks the Index's backing
    # array read-only, so editing index.values in place would raise
    # (and would silently mutate the caller's index as a side effect).
    new_index = index.values.copy()

    # Each duplicated value, listed once.  Index.duplicated() marks
    # every occurrence after the first; unique() collapses repeats.
    # (Index.get_duplicates() was removed in pandas 1.0.)
    dups = index[index.duplicated()].unique()

    for dup in dups:
        # The index is assumed sorted, so the duplicates form the
        # contiguous run [dup_index_left, dup_index_right].
        dup_index_left = index.searchsorted(dup, side="left")
        dup_index_right = index.searchsorted(dup, side="right") - 1
        num_dups = dup_index_right - dup_index_left + 1

        # Calculate delta that needs to be added to each duplicate
        # index: spread the duplicates evenly across the gap to the
        # next index entry.
        try:
            delta = (index[dup_index_right + 1] - dup) / num_dups
        except IndexError:
            # dup_index_right + 1 is outside of the series (i.e. the
            # dup is at the end of the series).
            delta = max_delta

        # Clamp the maximum delta added to max_delta
        if delta > max_delta:
            delta = max_delta

        # Leave the first occurrence intact and shift the i-th
        # duplicate by i * delta, so the run stays strictly increasing
        # and never reaches the next original index value.  (The old
        # code doubled delta on every step, which could collide with
        # the following index entry when a value was repeated four or
        # more times.)
        for offset in range(1, num_dups):
            new_index[dup_index_left + offset] += delta * offset

    # Return a copy carrying the de-duplicated index; the input is
    # left untouched.
    result = data.copy()
    result.index = pd.Index(new_index)
    return result
| |
def apply_callback(df, fn, *kwargs):
    """Iterate fast over all rows in a data frame and apply fn to each.

    :param df: the data frame to iterate; its index is exposed to the
        callback under the key ``'Time'``
    :param fn: callable invoked once per row with a dict mapping column
        name (plus ``'Time'``) to the row's value.  If extra positional
        arguments were supplied, they are forwarded as a single tuple:
        ``fn(event_dict, extra_args)``.
    :param kwargs: optional extra arguments, passed through to *fn* as
        one tuple (kept for backward compatibility with existing callers)
    """
    iters = df.itertuples()

    # Column names beginning with underscore will not be preserved in tuples
    # due to constraints on namedtuple field names, so store mappings from
    # column name to column number for each trace event.
    col_idxs = { name: idx for idx, name in enumerate(['Time'] + df.columns.tolist()) }

    # next(it, None) is Python-3 compatible (the old iters.next() was
    # Python-2 only) and returns None for an empty data frame instead
    # of raising StopIteration.
    event_tuple = next(iters, None)
    while event_tuple is not None:
        event_dict = { col: event_tuple[idx] for col, idx in col_idxs.items() }

        if kwargs:
            fn(event_dict, kwargs)
        else:
            fn(event_dict)

        event_tuple = next(iters, None)
| |
| |
def merge_dfs(pr_df, sec_df, pivot):
    """Merge a primary and a secondary data frame row-by-row.

    Rows from both frames are replayed in global order (both frames
    must carry a ``__line`` column giving that ordering).  NaN fields
    in a primary row are filled from the most recent secondary row
    that has the same *pivot* value.  Returns a new data frame of the
    merged primary rows, indexed by ``Time``.

    :param pr_df: primary data frame
    :param sec_df: secondary data frame
    :param pivot: column name used to match secondary rows to primary
        rows
    """
    # Keep track of last secondary event seen per pivot value
    pivot_map = {}

    # An array accumulating dicts with merged data
    merged_data = []
    def df_fn(data):
        # After the keyed concat below, 'Time' is a tuple of
        # (frame label, timestamp).
        # Store the latest secondary info
        if data['Time'][0] == 'secondary':
            pivot_map[data[pivot]] = data
            # Get rid of primary/secondary labels
            data['Time'] = data['Time'][1]
            return

        # Propagate latest secondary info into NaN fields of the
        # primary row.
        for key, value in data.items():
            if key == pivot:
                continue
            # Fast check for if value is nan (faster than np.isnan + try/except):
            # NaN is the only value that does not compare equal to itself.
            if value != value and data[pivot] in pivot_map:
                data[key] = pivot_map[data[pivot]][key]

        # Get rid of primary/secondary labels
        data['Time'] = data['Time'][1]
        merged_data.append(data)

    # DataFrame.sort() was removed in pandas 0.20; sort_values() is
    # the supported spelling.
    df = pd.concat([pr_df, sec_df],
                   keys=['primary', 'secondary']).sort_values(by='__line')
    apply_callback(df, df_fn)
    merged_df = pd.DataFrame.from_dict(merged_data)
    merged_df.set_index('Time', inplace=True)

    return merged_df