Source code for kwarray.algo_setcover

# -*- coding: utf-8 -*-
"""
Algorithms to find a solution to the setcover problem

SeeAlso:
    https://github.com/keon/algorithms
"""
import ubelt as ub
import itertools as it
from collections import OrderedDict


[docs] def setcover(candidate_sets_dict, items=None, set_weights=None, item_values=None, max_weight=None, algo='approx'): """ Finds a feasible solution to the minimum weight maximum value set cover. The quality and runtime of the solution will depend on the backend algorithm selected. Args: candidate_sets_dict (Dict[KT, List[VT]]): a dictionary where keys are the candidate set ids and each value is a candidate cover set. items (Optional[VT]): the set of all items to be covered, if not specified, it is infered from the candidate cover sets set_weights (Optional[Dict[KT, float]]): maps candidate set ids to a cost for using this candidate cover in the solution. If not specified the weight of each candiate cover defaults to 1. item_values (Optional[Dict[VT, float]]): maps each item to a value we get for returning this item in the solution. If not specified the value of each item defaults to 1. max_weight (Optional[float]): if specified, the total cost of the returned cover is constrained to be less than this number. algo (str): specifies which algorithm to use. Can either be 'approx' for the greedy solution or 'exact' for the globally optimal solution. Note the 'exact' algorithm solves an integer-linear-program, which can be very slow and requires the `pulp` package to be installed. Returns: Dict: a subdict of candidate_sets_dict containing the chosen solution. Example: >>> candidate_sets_dict = { >>> 'a': [1, 2, 3, 8, 9, 0], >>> 'b': [1, 2, 3, 4, 5], >>> 'c': [4, 5, 7], >>> 'd': [5, 6, 7], >>> 'e': [6, 7, 8, 9, 0], >>> } >>> greedy_soln = setcover(candidate_sets_dict, algo='greedy') >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0))) greedy_soln = {'a': [1, 2, 3, 8, 9, 0], 'c': [4, 5, 7], 'd': [5, 6, 7]} >>> # xdoc: +REQUIRES(module:pulp) >>> exact_soln = setcover(candidate_sets_dict, algo='exact') >>> print('exact_soln = {}'.format(ub.urepr(exact_soln, nl=0))) exact_soln = {'b': [1, 2, 3, 4, 5], 'e': [6, 7, 8, 9, 0]} """ if algo in ['approx', 'greedy']: return _setcover_greedy_old(candidate_sets_dict, items=items, set_weights=set_weights, item_values=item_values, max_weight=max_weight) elif algo in ['exact', 'ilp']: return _setcover_ilp(candidate_sets_dict, items=items, set_weights=set_weights, item_values=item_values, max_weight=max_weight) else: raise KeyError(algo)
[docs] def _setcover_greedy_old(candidate_sets_dict, items=None, set_weights=None, item_values=None, max_weight=None): """ Benchmark: items = np.arange(10000) candidate_sets_dict = {} for i in range(1000): candidate_sets_dict[i] = np.random.choice(items, 200).tolist() _setcover_greedy_new(candidate_sets_dict) == _setcover_greedy_old(candidate_sets_dict) _ = nh.util.profile_onthefly(_setcover_greedy_new)(candidate_sets_dict) _ = nh.util.profile_onthefly(_setcover_greedy_old)(candidate_sets_dict) import ubelt as ub for timer in ub.Timerit(3, bestof=1, label='time'): with timer: len(_setcover_greedy_new(candidate_sets_dict)) import ubelt as ub for timer in ub.Timerit(3, bestof=1, label='time'): with timer: len(_setcover_greedy_old(candidate_sets_dict)) """ solution_cover = {} if len(candidate_sets_dict) == 0: # O(1) optimal solution, we did it! return solution_cover # If candset_weights or item_values not given use the length as defaults if set_weights is None: get_weight = len else: def get_weight(solution_cover): return sum(set_weights[key] for key in solution_cover.keys()) if item_values is None: get_value = len else: def get_value(vals): return sum(item_values[v] for v in vals) if max_weight is None: max_weight = get_weight(candidate_sets_dict) avail_covers = {key: set(val) for key, val in candidate_sets_dict.items()} avail_keys, avail_vals = zip(*sorted(avail_covers.items())) avail_keys = list(avail_keys) avail_vals = list(avail_vals) # While we still need covers while get_weight(solution_cover) < max_weight and len(avail_keys) > 0: # Find candiate set with the most uncovered items uncovered_values = list(map(get_value, avail_vals)) chosen_idx = ub.argmax(uncovered_values) if uncovered_values[chosen_idx] <= 0: # needlessly adding value-less items break chosen_key = avail_keys[chosen_idx] # Add values in this key to the cover chosen_set = avail_covers[chosen_key] solution_cover[chosen_key] = candidate_sets_dict[chosen_key] # Remove chosen set from available options and covered items # from remaining available sets del avail_keys[chosen_idx] del avail_vals[chosen_idx] for vals in avail_vals: vals.difference_update(chosen_set) return solution_cover
[docs] def _setcover_greedy_new(candidate_sets_dict, items=None, set_weights=None, item_values=None, max_weight=None): """ Implements Johnson's / Chvatal's greedy set-cover approximation algorithm. The approximation gaurentees depend on specifications of set weights and item values Running time: N = number of universe items C = number of candidate covering sets Worst case running time is: O(C^2 * CN) (note this is via simple analysis, the big-oh might be better) Set Cover: log(len(items) + 1) approximation algorithm Weighted Maximum Cover: 1 - 1/e == .632 approximation algorithm Generalized maximum coverage is not implemented [WikiMaxCov]_. References: .. [WikiMaxCov] https://en.wikipedia.org/wiki/Maximum_coverage_problem Ignore: # pip install git+git://github.com/tangentlabs/django-oscar.git#egg=django-oscar. # TODO: wrap https://github.com/martin-steinegger/setcover/blob/master/SetCover.cpp # pip install SetCoverPy # This is actually much slower than my implementation from SetCoverPy import setcover g = setcover.SetCover(full_overlaps, cost=np.ones(len(full_overlaps))) g.greedy() keep = np.where(g.s)[0] Example: >>> candidate_sets_dict = { >>> 'a': [1, 2, 3, 8, 9, 0], >>> 'b': [1, 2, 3, 4, 5], >>> 'c': [4, 5, 7], >>> 'd': [5, 6, 7], >>> 'e': [6, 7, 8, 9, 0], >>> } >>> greedy_soln = _setcover_greedy_new(candidate_sets_dict) >>> #print(repr(greedy_soln)) ... >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0))) greedy_soln = {'a': [1, 2, 3, 8, 9, 0], 'c': [4, 5, 7], 'd': [5, 6, 7]} Example: >>> candidate_sets_dict = { >>> 'a': [1, 2, 3, 8, 9, 0], >>> 'b': [1, 2, 3, 4, 5], >>> 'c': [4, 5, 7], >>> 'd': [5, 6, 7], >>> 'e': [6, 7, 8, 9, 0], >>> } >>> items = list(set(it.chain(*candidate_sets_dict.values()))) >>> set_weights = {i: 1 for i in candidate_sets_dict.keys()} >>> item_values = {e: 1 for e in items} >>> greedy_soln = _setcover_greedy_new(candidate_sets_dict, >>> item_values=item_values, >>> set_weights=set_weights) >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0))) greedy_soln = {'a': [1, 2, 3, 8, 9, 0], 'c': [4, 5, 7], 'd': [5, 6, 7]} Example: >>> candidate_sets_dict = {} >>> greedy_soln = _setcover_greedy_new(candidate_sets_dict) >>> print('greedy_soln = {}'.format(ub.urepr(greedy_soln, nl=0))) greedy_soln = {} """ if len(candidate_sets_dict) == 0: # O(1) optimal solution, we did it! return {} solution_cover = {} solution_weight = 0 if items is None: items = list(set(it.chain(*candidate_sets_dict.values()))) # Inverted index item_to_keys = {item: set() for item in items} # This is actually a fair bit faster than the non-comprehension version [item_to_keys[item].add(key) for key, vals in candidate_sets_dict.items() for item in vals] # If set_weights or item_values not given use the length as defaults if set_weights is None: get_weight = len else: # TODO: we can improve this with bookkeeping def get_weight(solution_cover): return sum(set_weights[key] for key in solution_cover.keys()) if item_values is None: get_value = len else: def get_value(vals): return sum(item_values[v] for v in vals) if max_weight is None: max_weight = get_weight(candidate_sets_dict) avail_covers = OrderedDict([ (key, set(vals)) for key, vals in sorted(candidate_sets_dict.items()) ]) avail_totals = OrderedDict([ (key, get_value(vals)) for key, vals in avail_covers.items() ]) print('avail_covers = {}'.format(ub.urepr(avail_covers, nl=1))) print('avail_totals = {}'.format(ub.urepr(avail_totals, nl=1))) # While we still need covers while solution_weight < max_weight and len(avail_covers) > 0: # Find candiate set with the most valuable uncovered items chosen_key = ub.argmax(avail_totals) if avail_totals[chosen_key] <= 0: # needlessly adding value-less covering set break print('-----') print('CHOOSE COVER SET = {!r}'.format(chosen_key)) # Add values in this key to the cover chosen_items = avail_covers[chosen_key] solution_cover[chosen_key] = candidate_sets_dict[chosen_key] # Update the solution weight chosen_weight = (1 if set_weights is None else set_weights[chosen_key]) solution_weight += chosen_weight # Remove chosen covering set from available options del avail_covers[chosen_key] del avail_totals[chosen_key] # For each chosen item, find the other sets that it belongs to modified_keys = set() for item in chosen_items: # Update the inverted index new_keys = item_to_keys[item] new_keys.remove(chosen_key) item_to_keys[item] = new_keys # And mark the non-chosen reamining cover sets as modified modified_keys.update(new_keys) # Then update and recompute the value of the modified sets for key in modified_keys: avail_covers[key].difference_update(chosen_items) newval = get_value(avail_covers[key]) avail_totals[key] = newval print('avail_covers = {}'.format(ub.urepr(avail_covers, nl=1))) print('avail_totals = {}'.format(ub.urepr(avail_totals, nl=1))) print('solution_cover = {!r}'.format(solution_cover)) return solution_cover
[docs] def _setcover_ilp(candidate_sets_dict, items=None, set_weights=None, item_values=None, max_weight=None, verbose=False): """ Set cover / Weighted Maximum Cover exact algorithm using an integer linear program. TODO: - [ ] Use CPLEX solver if available Example: >>> # xdoc: +REQUIRES(module:pulp) >>> candidate_sets_dict = {} >>> exact_soln = _setcover_ilp(candidate_sets_dict) >>> print('exact_soln = {}'.format(ub.urepr(exact_soln, nl=0))) exact_soln = {} Example: >>> # xdoc: +REQUIRES(module:pulp) >>> candidate_sets_dict = { >>> 'a': [1, 2, 3, 8, 9, 0], >>> 'b': [1, 2, 3, 4, 5], >>> 'c': [4, 5, 7], >>> 'd': [5, 6, 7], >>> 'e': [6, 7, 8, 9, 0], >>> } >>> items = list(set(it.chain(*candidate_sets_dict.values()))) >>> set_weights = {i: 1 for i in candidate_sets_dict.keys()} >>> item_values = {e: 1 for e in items} >>> exact_soln1 = _setcover_ilp(candidate_sets_dict, >>> item_values=item_values, >>> set_weights=set_weights) >>> exact_soln2 = _setcover_ilp(candidate_sets_dict) >>> assert exact_soln1 == exact_soln2 """ try: import pulp except ImportError: print('ERROR: must install pulp to use ILP setcover solver') raise if len(candidate_sets_dict) == 0: return {} if items is None: items = list(set(it.chain(*candidate_sets_dict.values()))) if item_values is None and set_weights is None and max_weight is None: # This is the most basic set cover problem # Formulate integer program prob = pulp.LpProblem("Set Cover", pulp.LpMinimize) # Solution variable indicates if set it chosen or not set_indices = candidate_sets_dict.keys() x = pulp.LpVariable.dicts(name='x', indexs=set_indices, lowBound=0, upBound=1, cat=pulp.LpInteger) # minimize the number of sets prob.objective = sum(x[i] for i in set_indices) # subject to for e in items: # each element is covered containing_sets = [i for i in set_indices if e in candidate_sets_dict[i]] prob.add(sum(x[i] for i in containing_sets) >= 1) # Solve using with solver like CPLEX, GLPK, or SCIP. #pulp.CPLEX().solve(prob) pulp.PULP_CBC_CMD().solve(prob) # Read solution solution_keys = [i for i in set_indices if x[i].varValue] solution_cover = {i: candidate_sets_dict[i] for i in solution_keys} # Print summary if verbose: print(prob) print('OPT:') print('\n'.join([' %s = %s' % (x[i].name, x[i].varValue) for i in set_indices])) print('solution_cover = %r' % (solution_cover,)) else: if set_weights is None: set_weights = {i: 1 for i in candidate_sets_dict.keys()} if item_values is None: item_values = {e: 1 for e in items} if max_weight is None: max_weight = sum(set_weights[i] for i in candidate_sets_dict.keys()) prob = pulp.LpProblem("Maximum Cover", pulp.LpMaximize) # Solution variable indicates if set it chosen or not item_indicies = items set_indices = candidate_sets_dict.keys() x = pulp.LpVariable.dicts(name='x', indexs=set_indices, lowBound=0, upBound=1, cat=pulp.LpInteger) y = pulp.LpVariable.dicts(name='y', indexs=item_indicies, lowBound=0, upBound=1, cat=pulp.LpInteger) r = pulp.LpVariable.dicts(name='r', indexs=item_indicies) # maximize the value of the covered items primary_objective = sum(item_values[e] * y[e] for e in item_indicies) # minimize the number of sets used (make sure it does not influence the chosen primary objective) # This is only possible when values are non-negative # TODO: minimize redundency min_influence = min(item_values.values()) secondary_weight = min_influence / (1.1 * len(set_indices)) secondary_objective = (sum(-x[i] for i in set_indices)) * secondary_weight # prob.objective = primary_objective + secondary_objective # subject to # no more than the maximum weight prob.add(sum(x[i] * set_weights[i] for i in set_indices) <= max_weight) # If an item is chosen than at least one set containing it is chosen for e in item_indicies: containing_sets = [i for i in set_indices if e in candidate_sets_dict[i]] if len(containing_sets) > 0: prob.add(sum(x[i] for i in containing_sets) >= y[e]) # record number of times each item is covered prob.add(sum(x[i] for i in containing_sets) == r[e]) # Solve using with solver like CPLEX, GLPK, or SCIP. #pulp.CPLEX().solve(prob) pulp.PULP_CBC_CMD().solve(prob) # Read solution solution_keys = [i for i in set_indices if x[i].varValue] solution_cover = {i: candidate_sets_dict[i] for i in solution_keys} # Print summary if verbose: print(prob) print('OPT:') print('\n'.join([' %s = %s' % (x[i].name, x[i].varValue) for i in set_indices])) print('\n'.join([' %s = %s' % (y[i].name, y[i].varValue) for i in item_indicies])) print('solution_cover = %r' % (solution_cover,)) return solution_cover