# Source code for pahelix.utils.data_utils

#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
| Tools for data.
"""

import numpy as np
import os
import random


def save_data_list_to_npz(data_list, npz_file):
    """Save a list of data to the npz file. Each data is a dict of numpy ndarray.

    For each key, the per-sample values are flattened into one array plus a
    ``<key>.seq_len`` array recording each sample's length, and a
    ``<key>.singular`` flag marking whether the field held scalars.

    Args:
        data_list(list): a list of data, each a dict of numpy ndarray.
        npz_file(str): the npz file location.
    """
    merged = {}
    # Every sample is assumed to carry the same set of keys as the first one.
    for key in data_list[0].keys():
        if np.ndim(data_list[0][key]) == 0:
            # Scalar field: stack values directly, each "sequence" has length 1.
            seq_lens = np.ones(len(data_list)).astype('int')
            flat_values = np.array([sample[key] for sample in data_list])
            is_scalar = 1
        else:
            # Sequence field: remember each length, then concatenate flat.
            seq_lens = np.array([len(sample[key]) for sample in data_list])
            flat_values = np.concatenate([sample[key] for sample in data_list], 0)
            is_scalar = 0
        merged[key] = flat_values
        merged[key + '.seq_len'] = seq_lens
        merged[key + '.singular'] = is_scalar
    np.savez_compressed(npz_file, **merged)
def load_npz_to_data_list(npz_file):
    """Reload the data list saved by ``save_data_list_to_npz``.

    Args:
        npz_file(str): the npz file location.

    Returns:
        a list of data where each data is a dict of numpy ndarray.
    """
    def _unflatten(flat_values, seq_lens, is_scalar):
        # Undo the concatenation: slice per-sample chunks back out, or pick
        # single elements when the field was stored as scalars.
        chunks = []
        offset = 0
        for length in seq_lens:
            if is_scalar == 0:
                chunks.append(flat_values[offset: offset + length])
            else:
                chunks.append(flat_values[offset])
            offset += length
        return chunks

    merged = np.load(npz_file, allow_pickle=True)
    # Real field names are those without the bookkeeping suffixes.
    field_names = [
        name for name in merged.keys()
        if not name.endswith('.seq_len') and not name.endswith('.singular')
    ]
    per_field = {
        name: _unflatten(
            merged[name],
            merged[name + '.seq_len'],
            merged[name + '.singular'])
        for name in field_names
    }
    num_samples = len(per_field[field_names[0]])
    return [
        {name: per_field[name][i] for name in field_names}
        for i in range(num_samples)
    ]
def get_part_files(data_path, trainer_id, trainer_num, seed=0):
    """Split the files in data_path so that each trainer can train from
    different examples.

    Bug fix: the original shuffled with the process-global ``random`` state,
    so trainers running in separate processes could each compute a DIFFERENT
    permutation — making the per-trainer partitions overlap and drop files.
    We now sort the listing (``os.listdir`` order is arbitrary) and shuffle
    with a private, fixed-seed RNG so every trainer derives the identical
    permutation and the round-robin split is disjoint and exhaustive.

    Args:
        data_path(str): directory containing the data files.
        trainer_id(int): index of this trainer in ``[0, trainer_num)``.
        trainer_num(int): total number of trainers.
        seed(int): shared shuffle seed; must be the same on every trainer.

    Returns:
        list of file paths (``data_path + '/' + filename``) assigned to
        this trainer.
    """
    # Sort first: os.listdir order is filesystem-dependent, so the shuffle
    # must start from a canonical ordering to be reproducible everywhere.
    filenames = sorted(os.listdir(data_path))
    # Private RNG: never touch the global random state of the caller.
    random.Random(seed).shuffle(filenames)
    part_filenames = []
    for i, filename in enumerate(filenames):
        # Round-robin assignment of the shared permutation.
        if i % trainer_num == trainer_id:
            part_filenames.append(data_path + '/' + filename)
    return part_filenames