Source code for kwarray.algo_setcover

# -*- coding: utf-8 -*-
"""
Algorithms to find a solution to the setcover problem

SeeAlso:
    https://github.com/keon/algorithms
"""
import ubelt as ub
import itertools as it
from collections import OrderedDict



[docs]
def setcover(candidate_sets_dict, items=None, set_weights=None,
             item_values=None, max_weight=None, algo='approx'):
    """
    Finds a feasible solution to the minimum weight maximum value set cover.
    The quality and runtime of the solution will depend on the backend
    algorithm selected.

    Args:
        candidate_sets_dict (Dict[KT, List[VT]]):
            a dictionary where keys are the candidate set ids and each value is
            a candidate cover set.

        items (Optional[VT]): the set of all items to be covered,
            if not specified, it is infered from the candidate cover sets

        set_weights (Optional[Dict[KT, float]]):
            maps candidate set ids to a cost for using this candidate cover in
            the solution. If not specified the weight of each candiate cover
            defaults to 1.

        item_values (Optional[Dict[VT, float]]):
            maps each item to a value we get for returning this item in the
            solution. If not specified the value of each item defaults to 1.

        max_weight (Optional[float]): if specified, the total cost of the
            returned cover is constrained to be less than this number.

        algo (str): specifies which algorithm to use. Can either be
            'approx' for the greedy solution or 'exact' for the globally
            optimal solution. Note the 'exact' algorithm solves an
            integer-linear-program, which can be very slow and requires
            the `pulp` package to be installed.

    Returns:
        Dict: a subdict of candidate_sets_dict containing the chosen solution.

    Example:
        >>> candidate_sets_dict = {
        >>>     'a': [1, 2, 3, 8, 9, 0],
        >>>     'b': [1, 2, 3, 4, 5],
        >>>     'c': [4, 5, 7],
        >>>     'd': [5, 6, 7],
        >>>     'e': [6, 7, 8, 9, 0],
        >>> }
        >>> greedy_soln = setcover(candidate_sets_dict, algo='greedy')
        >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0)))
        greedy_soln = {'a': [1, 2, 3, 8, 9, 0], 'c': [4, 5, 7], 'd': [5, 6, 7]}
        >>> # xdoc: +REQUIRES(module:pulp)
        >>> exact_soln = setcover(candidate_sets_dict, algo='exact')
        >>> print('exact_soln = {}'.format(ub.urepr(exact_soln, nl=0)))
        exact_soln = {'b': [1, 2, 3, 4, 5], 'e': [6, 7, 8, 9, 0]}
    """
    if algo in ['approx', 'greedy']:
        return _setcover_greedy_old(candidate_sets_dict, items=items,
                                    set_weights=set_weights,
                                    item_values=item_values,
                                    max_weight=max_weight)
    elif algo in ['exact', 'ilp']:
        return _setcover_ilp(candidate_sets_dict, items=items,
                             set_weights=set_weights, item_values=item_values,
                             max_weight=max_weight)
    else:
        raise KeyError(algo)




[docs]
def _setcover_greedy_old(candidate_sets_dict, items=None, set_weights=None,
                         item_values=None, max_weight=None):
    """
    Benchmark:
        items = np.arange(10000)
        candidate_sets_dict = {}
        for i in range(1000):
            candidate_sets_dict[i] = np.random.choice(items, 200).tolist()

        _setcover_greedy_new(candidate_sets_dict) == _setcover_greedy_old(candidate_sets_dict)
        _ = nh.util.profile_onthefly(_setcover_greedy_new)(candidate_sets_dict)
        _ = nh.util.profile_onthefly(_setcover_greedy_old)(candidate_sets_dict)

        import ubelt as ub
        for timer in ub.Timerit(3, bestof=1, label='time'):
            with timer:
                len(_setcover_greedy_new(candidate_sets_dict))

        import ubelt as ub
        for timer in ub.Timerit(3, bestof=1, label='time'):
            with timer:
                len(_setcover_greedy_old(candidate_sets_dict))
    """
    solution_cover = {}

    if len(candidate_sets_dict) == 0:
        # O(1) optimal solution, we did it!
        return solution_cover

    # If candset_weights or item_values not given use the length as defaults
    if set_weights is None:
        get_weight = len
    else:
        def get_weight(solution_cover):
            return sum(set_weights[key] for key in solution_cover.keys())
    if item_values is None:
        get_value = len
    else:
        def get_value(vals):
            return sum(item_values[v] for v in vals)
    if max_weight is None:
        max_weight = get_weight(candidate_sets_dict)

    avail_covers = {key: set(val) for key, val in candidate_sets_dict.items()}
    avail_keys, avail_vals = zip(*sorted(avail_covers.items()))
    avail_keys = list(avail_keys)
    avail_vals = list(avail_vals)

    # While we still need covers
    while get_weight(solution_cover) < max_weight and len(avail_keys) > 0:
        # Find candiate set with the most uncovered items
        uncovered_values = list(map(get_value, avail_vals))
        chosen_idx = ub.argmax(uncovered_values)
        if uncovered_values[chosen_idx] <= 0:
            # needlessly adding value-less items
            break
        chosen_key = avail_keys[chosen_idx]
        # Add values in this key to the cover
        chosen_set = avail_covers[chosen_key]
        solution_cover[chosen_key] = candidate_sets_dict[chosen_key]
        # Remove chosen set from available options and covered items
        # from remaining available sets
        del avail_keys[chosen_idx]
        del avail_vals[chosen_idx]
        for vals in avail_vals:
            vals.difference_update(chosen_set)
    return solution_cover




[docs]
def _setcover_greedy_new(candidate_sets_dict, items=None, set_weights=None,
                         item_values=None, max_weight=None):
    """
    Implements Johnson's / Chvatal's greedy set-cover approximation algorithm.

    The approximation gaurentees depend on specifications of set weights and
    item values

    Running time:
        N = number of universe items
        C = number of candidate covering sets

        Worst case running time is: O(C^2 * CN)
            (note this is via simple analysis, the big-oh might be better)

    Set Cover: log(len(items) + 1) approximation algorithm
    Weighted Maximum Cover: 1 - 1/e == .632 approximation algorithm
    Generalized maximum coverage is not implemented [WikiMaxCov]_.

    References:
        .. [WikiMaxCov] https://en.wikipedia.org/wiki/Maximum_coverage_problem

    Ignore:
        # pip install git+git://github.com/tangentlabs/django-oscar.git#egg=django-oscar.
        # TODO: wrap https://github.com/martin-steinegger/setcover/blob/master/SetCover.cpp
        # pip install SetCoverPy
        # This is actually much slower than my implementation
        from SetCoverPy import setcover
        g = setcover.SetCover(full_overlaps, cost=np.ones(len(full_overlaps)))
        g.greedy()
        keep = np.where(g.s)[0]

    Example:
        >>> candidate_sets_dict = {
        >>>     'a': [1, 2, 3, 8, 9, 0],
        >>>     'b': [1, 2, 3, 4, 5],
        >>>     'c': [4, 5, 7],
        >>>     'd': [5, 6, 7],
        >>>     'e': [6, 7, 8, 9, 0],
        >>> }
        >>> greedy_soln = _setcover_greedy_new(candidate_sets_dict)
        >>> #print(repr(greedy_soln))
        ...
        >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0)))
        greedy_soln = {'a': [1, 2, 3, 8, 9, 0], 'c': [4, 5, 7], 'd': [5, 6, 7]}

    Example:
        >>> candidate_sets_dict = {
        >>>     'a': [1, 2, 3, 8, 9, 0],
        >>>     'b': [1, 2, 3, 4, 5],
        >>>     'c': [4, 5, 7],
        >>>     'd': [5, 6, 7],
        >>>     'e': [6, 7, 8, 9, 0],
        >>> }
        >>> items = list(set(it.chain(*candidate_sets_dict.values())))
        >>> set_weights = {i: 1 for i in candidate_sets_dict.keys()}
        >>> item_values = {e: 1 for e in items}
        >>> greedy_soln = _setcover_greedy_new(candidate_sets_dict,
        >>>                             item_values=item_values,
        >>>                             set_weights=set_weights)
        >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0)))
        greedy_soln = {'a': [1, 2, 3, 8, 9, 0], 'c': [4, 5, 7], 'd': [5, 6, 7]}

    Example:
        >>> candidate_sets_dict = {}
        >>> greedy_soln = _setcover_greedy_new(candidate_sets_dict)
        >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0)))
        greedy_soln = {}
    """
    if len(candidate_sets_dict) == 0:
        # O(1) optimal solution, we did it!
        return {}

    solution_cover = {}
    solution_weight = 0

    if items is None:
        items = list(set(it.chain(*candidate_sets_dict.values())))

    # Inverted index
    item_to_keys = {item: set() for item in items}
    # This is actually a fair bit faster than the non-comprehension version
    [item_to_keys[item].add(key)
     for key, vals in candidate_sets_dict.items()
     for item in vals]

    # If set_weights or item_values not given use the length as defaults
    if set_weights is None:
        get_weight = len
    else:
        # TODO: we can improve this with bookkeeping
        def get_weight(solution_cover):
            return sum(set_weights[key] for key in solution_cover.keys())

    if item_values is None:
        get_value = len
    else:
        def get_value(vals):
            return sum(item_values[v] for v in vals)
    if max_weight is None:
        max_weight = get_weight(candidate_sets_dict)

    avail_covers = OrderedDict([
        (key, set(vals))
        for key, vals in sorted(candidate_sets_dict.items())
    ])
    avail_totals = OrderedDict([
        (key, get_value(vals))
        for key, vals in avail_covers.items()
    ])

    print('avail_covers = {}'.format(ub.urepr(avail_covers, nl=1)))
    print('avail_totals = {}'.format(ub.urepr(avail_totals, nl=1)))

    # While we still need covers
    while solution_weight < max_weight and len(avail_covers) > 0:
        # Find candiate set with the most valuable uncovered items
        chosen_key = ub.argmax(avail_totals)
        if avail_totals[chosen_key] <= 0:
            # needlessly adding value-less covering set
            break

        print('-----')
        print('CHOOSE COVER SET = {!r}'.format(chosen_key))

        # Add values in this key to the cover
        chosen_items = avail_covers[chosen_key]
        solution_cover[chosen_key] = candidate_sets_dict[chosen_key]

        # Update the solution weight
        chosen_weight = (1 if set_weights is None else set_weights[chosen_key])
        solution_weight += chosen_weight

        # Remove chosen covering set from available options
        del avail_covers[chosen_key]
        del avail_totals[chosen_key]

        # For each chosen item, find the other sets that it belongs to
        modified_keys = set()
        for item in chosen_items:
            # Update the inverted index
            new_keys = item_to_keys[item]
            new_keys.remove(chosen_key)
            item_to_keys[item] = new_keys
            # And mark the non-chosen reamining cover sets as modified
            modified_keys.update(new_keys)
        # Then update and recompute the value of the modified sets
        for key in modified_keys:
            avail_covers[key].difference_update(chosen_items)
            newval = get_value(avail_covers[key])
            avail_totals[key] = newval

        print('avail_covers = {}'.format(ub.urepr(avail_covers, nl=1)))
        print('avail_totals = {}'.format(ub.urepr(avail_totals, nl=1)))

    print('solution_cover = {!r}'.format(solution_cover))
    return solution_cover




[docs]
def _setcover_ilp(candidate_sets_dict, items=None, set_weights=None,
                  item_values=None, max_weight=None, verbose=False):
    """
    Set cover / Weighted Maximum Cover exact algorithm using an integer linear
    program.

    TODO:
        - [ ] Use CPLEX solver if available

    Example:
        >>> # xdoc: +REQUIRES(module:pulp)
        >>> candidate_sets_dict = {}
        >>> exact_soln = _setcover_ilp(candidate_sets_dict)
        >>> print('exact_soln = {}'.format(ub.urepr(exact_soln, nl=0)))
        exact_soln = {}

    Example:
        >>> # xdoc: +REQUIRES(module:pulp)
        >>> candidate_sets_dict = {
        >>>     'a': [1, 2, 3, 8, 9, 0],
        >>>     'b': [1, 2, 3, 4, 5],
        >>>     'c': [4, 5, 7],
        >>>     'd': [5, 6, 7],
        >>>     'e': [6, 7, 8, 9, 0],
        >>> }
        >>> items = list(set(it.chain(*candidate_sets_dict.values())))
        >>> set_weights = {i: 1 for i in candidate_sets_dict.keys()}
        >>> item_values = {e: 1 for e in items}
        >>> exact_soln1 = _setcover_ilp(candidate_sets_dict,
        >>>                             item_values=item_values,
        >>>                             set_weights=set_weights)
        >>> exact_soln2 = _setcover_ilp(candidate_sets_dict)
        >>> assert exact_soln1 == exact_soln2
    """
    try:
        import pulp
    except ImportError:
        print('ERROR: must install pulp to use ILP setcover solver')
        raise

    if len(candidate_sets_dict) == 0:
        return {}

    if items is None:
        items = list(set(it.chain(*candidate_sets_dict.values())))

    if item_values is None and set_weights is None and max_weight is None:
        # This is the most basic set cover problem
        # Formulate integer program
        prob = pulp.LpProblem("Set Cover", pulp.LpMinimize)
        # Solution variable indicates if set it chosen or not
        set_indices = candidate_sets_dict.keys()
        x = pulp.LpVariable.dicts(name='x', indexs=set_indices,
                                  lowBound=0, upBound=1, cat=pulp.LpInteger)
        # minimize the number of sets
        prob.objective = sum(x[i] for i in set_indices)
        # subject to
        for e in items:
            # each element is covered
            containing_sets = [i for i in set_indices if e in candidate_sets_dict[i]]
            prob.add(sum(x[i] for i in containing_sets) >= 1)
        # Solve using with solver like CPLEX, GLPK, or SCIP.
        #pulp.CPLEX().solve(prob)
        pulp.PULP_CBC_CMD().solve(prob)
        # Read solution
        solution_keys = [i for i in set_indices if x[i].varValue]
        solution_cover = {i: candidate_sets_dict[i] for i in solution_keys}
        # Print summary
        if verbose:
            print(prob)
            print('OPT:')
            print('\n'.join(['    %s = %s' % (x[i].name, x[i].varValue) for i in set_indices]))
            print('solution_cover = %r' % (solution_cover,))
    else:
        if set_weights is None:
            set_weights = {i: 1 for i in candidate_sets_dict.keys()}
        if item_values is None:
            item_values = {e: 1 for e in items}
        if max_weight is None:
            max_weight = sum(set_weights[i] for i in candidate_sets_dict.keys())
        prob = pulp.LpProblem("Maximum Cover", pulp.LpMaximize)
        # Solution variable indicates if set it chosen or not
        item_indicies = items
        set_indices = candidate_sets_dict.keys()
        x = pulp.LpVariable.dicts(name='x', indexs=set_indices,
                                  lowBound=0, upBound=1, cat=pulp.LpInteger)
        y = pulp.LpVariable.dicts(name='y', indexs=item_indicies,
                                  lowBound=0, upBound=1, cat=pulp.LpInteger)
        r = pulp.LpVariable.dicts(name='r', indexs=item_indicies)
        # maximize the value of the covered items
        primary_objective = sum(item_values[e] * y[e] for e in item_indicies)
        # minimize the number of sets used (make sure it does not influence the chosen primary objective)
        # This is only possible when values are non-negative
        # TODO: minimize redundency
        min_influence = min(item_values.values())
        secondary_weight = min_influence / (1.1 * len(set_indices))
        secondary_objective = (sum(-x[i] for i in set_indices)) * secondary_weight
        #
        prob.objective = primary_objective + secondary_objective
        # subject to
        # no more than the maximum weight
        prob.add(sum(x[i] * set_weights[i] for i in set_indices) <= max_weight)
        # If an item is chosen than at least one set containing it is chosen
        for e in item_indicies:
            containing_sets = [i for i in set_indices if e in candidate_sets_dict[i]]
            if len(containing_sets) > 0:
                prob.add(sum(x[i] for i in containing_sets) >= y[e])
                # record number of times each item is covered
                prob.add(sum(x[i] for i in containing_sets) == r[e])
        # Solve using with solver like CPLEX, GLPK, or SCIP.
        #pulp.CPLEX().solve(prob)
        pulp.PULP_CBC_CMD().solve(prob)
        # Read solution
        solution_keys = [i for i in set_indices if x[i].varValue]
        solution_cover = {i: candidate_sets_dict[i] for i in solution_keys}
        # Print summary
        if verbose:
            print(prob)
            print('OPT:')
            print('\n'.join(['    %s = %s' % (x[i].name, x[i].varValue) for i in set_indices]))
            print('\n'.join(['    %s = %s' % (y[i].name, y[i].varValue) for i in item_indicies]))
            print('solution_cover = %r' % (solution_cover,))
    return solution_cover