[2]:
import numpy as np
import time
import pickle
from chipsplitting import PascalForm, HyperfieldHomogeneousLinearSystem as HVLinearSystem
Note: This notebook uses Cython. Compile the code by running ``python setup.py build_ext --inplace`` in your terminal.
Support Candidates
To search for fundamental statistical models in \(\Delta_n\), we consider the equivalent problem of finding fundamental chipsplitting models with positive support of size \(n+1\). A naive approach would be to enumerate all supports \(S \subseteq V_d\), where \(V_d = \{(i,j) \in \mathbb{Z}_{\ge 0}^2 : i+j \le d\}\), and check for each whether it is the positive support of a fundamental chipsplitting model. Since \(|V_d| = (d+1)(d+2)/2\), this would require \(\binom{(d+1)(d+2)/2}{n+1}\) checks.
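To see how quickly this count grows, the following cell evaluates the binomial coefficient for the parameter ranges considered below (a minimal illustration using math.comb from the standard library; it is not part of the search pipeline).
[ ]:
from math import comb

def naive_search_space(n, d):
    # Choose n+1 support points out of the |V_d| = (d+1)(d+2)/2 points of V_d.
    return comb((d + 1) * (d + 2) // 2, n + 1)

print(naive_search_space(7, 7))   # binom(36, 8) = 30,260,340
print(naive_search_space(8, 11))  # binom(78, 9) is roughly 1.8e11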
Optimized Search Using Remark 6.11 of [BM25]
We reduce this search space using Remark 6.11 of [BM25]. The initial implementation of this algorithm was provided by Bik and Marigliano on Mathrepo.
We provide a Cython version of the same algorithm (see Step 1, Step 2, and Step 3 [BM25_MATHREPO]) to speed up computations.
[5]:
# Cython-accelerated version of the algorithm of Bik and Marigliano
def find_positive_supports(pos_support_size, d):
    # Pascal forms for every diagonal, row, and column index k = 0, ..., d,
    # mapped to the hyperfield, make up the linear system to be solved.
    base_types = ["diag", "row", "col"]
    A = [PascalForm(d, b, k).to_hyperfield() for b in base_types for k in range(d + 1)]
    return HVLinearSystem(A).quick_solve_loop_fast(pos_support_size)
Computations for \(\Delta_7\)
[3]:
to_compute = [(7,7), (7,8), (7,9), (7,10), (7,11), (7,12), (7,13)]
print("Starting computation...")
print("-" * 60)
total_start_time = time.perf_counter()
for n, d in to_compute:
    iter_start_time = time.perf_counter()
    pos_support_size = n + 1
    possible_supports = find_positive_supports(pos_support_size, d)
    iter_elapsed_time = time.perf_counter() - iter_start_time
    print(f"n={n:<2}, d={d:<2} | Supports found: {len(possible_supports):<7} | Time: {iter_elapsed_time:>5.2f}s")
total_elapsed_time = time.perf_counter() - total_start_time
print("-" * 60)
print(f"Total computation time: {total_elapsed_time:.2f}s")
Starting computation...
------------------------------------------------------------
n=7 , d=7 | Supports found: 76922 | Time: 0.19s
n=7 , d=8 | Supports found: 436896 | Time: 1.06s
n=7 , d=9 | Supports found: 1927201 | Time: 5.13s
n=7 , d=10 | Supports found: 6508380 | Time: 19.66s
n=7 , d=11 | Supports found: 18676991 | Time: 60.78s
n=7 , d=12 | Supports found: 47682475 | Time: 163.59s
n=7 , d=13 | Supports found: 107547676 | Time: 403.89s
------------------------------------------------------------
Total computation time: 654.30s
Computations for \(\Delta_8\)
[6]:
to_compute = [(8,8), (8,9), (8,10), (8,11)]
print("Starting computation...")
print("-" * 60)
total_start_time = time.perf_counter()
for n, d in to_compute:
    iter_start_time = time.perf_counter()
    pos_support_size = n + 1
    possible_supports = find_positive_supports(pos_support_size, d)
    iter_elapsed_time = time.perf_counter() - iter_start_time
    print(f"n={n:<2}, d={d:<2} | Supports found: {len(possible_supports):<7} | Time: {iter_elapsed_time:>5.2f}s")
total_elapsed_time = time.perf_counter() - total_start_time
print("-" * 60)
print(f"Total computation time: {total_elapsed_time:.2f}s")
Starting computation...
------------------------------------------------------------
n=8 , d=8 | Supports found: 622003 | Time: 11.65s
n=8 , d=9 | Supports found: 4470103 | Time: 12.49s
n=8 , d=10 | Supports found: 21517185 | Time: 67.20s
n=8 , d=11 | Supports found: 88151373 | Time: 291.52s
------------------------------------------------------------
Total computation time: 382.86s
Optional Optimization: Remove Non-Minimal Supports
The previous step, find_positive_supports(n+1, d), generated candidate positive supports \(S \subseteq V_d\) for fundamental models. For example, with \(n=8, d=11\), 88,151,373 supports were returned.
These returned supports may have a size less than \(n+1\). For instance, \(S = \{(1, 1),(0, 3),(3, 0),(3, 7),(2, 9),(7, 4),(8, 3)\}\) is a candidate support.
If our target size is \(n+1\), we would need to extend \(S\) by adding \(n+1 - |S|\) further tuples \((i,j) \in V_d\), which means checking \(\binom{(d+1)(d+2)/2 - |S|}{n+1 - |S|}\) combinations for a single candidate \(S\). To keep this manageable, we restrict attention to minimal support sets.
Optimization: if \(S' \supset S\) for some other candidate \(S\), then \(S'\) can be excluded, since every extension of \(S'\) is also an extension of \(S\), which is already the more concise candidate.
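For the size-7 example above with \(n=8\) and \(d=11\), the number of extensions of this single candidate can be evaluated directly (a small illustrative computation with math.comb; not part of the pipeline):
[ ]:
from math import comb

# |V_11| = 12 * 13 / 2 = 78 points; the example support has |S| = 7, so the
# remaining n+1 - |S| = 2 points must be chosen from the 78 - 7 = 71 unused points.
print(comb(78 - 7, 9 - 7))  # binom(71, 2) = 2485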
[3]:
import multiprocess
from itertools import repeat
def find_length_change_indices(sorted_data):
"""
Detects the changes in the length of the items in a sorted list of sets
and returns the indices that define batches of same-length items.
Args:
sorted_data: A list of sets, sorted by length in ascending order.
Returns:
A list of integers representing the starting indices of each batch
of same-length sets, plus the total length of the data at the end.
"""
if not sorted_data:
return [0]
indices = [0]
current_len = len(sorted_data[0])
for i, item in enumerate(sorted_data):
if len(item) != current_len:
indices.append(i)
current_len = len(item)
if indices[-1] != len(sorted_data):
indices.append(len(sorted_data))
return indices
# Example Usage of find_length_change_indices:
# sorted_data = [
# {'a'}, # Length 1
# {'b'}, # Length 1
# {'c', 'd'}, # Length 2
# {'e', 'f'}, # Length 2
# {'g', 'h', 'i'}, # Length 3
# {'j', 'k', 'l', 'm'} # Length 4
# ]
#
# result = find_length_change_indices(sorted_data)
# print(result) # Expected output: [0, 2, 4, 5, 6]
def _is_minimal_in_batch(s, current_minimal_sets):
"""
A worker function for parallel processing. It checks if a single set 's'
is a superset of any of the already confirmed minimal sets.
This function is executed by each process in the pool.
"""
for m in current_minimal_sets:
if m.issubset(s):
return None # It is a superset, therefore not minimal
return s # It is minimal with respect to the sets checked
def find_minimal_sets_parallel(initial_data, num_processes=None):
"""
Finds the minimal sets from a list of sets using parallel processing.
This function batches the input data by the length of the sets and
processes each batch of same-length sets in parallel.
Args:
initial_data: A list of lists, sets, or other iterables.
num_processes: The number of CPU processes to use.
Defaults to the number of cores on the machine.
Returns:
A set of frozensets, where each frozenset is a minimal set.
"""
processed_data = {frozenset(item) for item in initial_data}
sorted_data = sorted(list(processed_data), key=len)
if not sorted_data:
return set()
batch_indices = find_length_change_indices(sorted_data)
minimal_sets = set()
with multiprocess.Pool(processes=num_processes) as pool:
for i in range(len(batch_indices) - 1):
start_index = batch_indices[i]
end_index = batch_indices[i+1]
batch = sorted_data[start_index:end_index]
# The results from the map will be either the set itself (if minimal) or None
results = pool.starmap(
_is_minimal_in_batch,
zip(batch, repeat(minimal_sets))
)
# Collect the new minimal sets found in the current batch
new_minimal_in_this_batch = {res for res in results if res is not None}
minimal_sets.update(new_minimal_in_this_batch)
return minimal_sets
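As a quick illustration of the filter on toy data (not part of the search pipeline), the cell below keeps exactly the sets that contain no other candidate:
[ ]:
toy = [{1, 2}, {1, 2, 3}, {4}, {4, 5}, {2, 3}]
print(find_minimal_sets_parallel(toy, num_processes=2))
# {4, 5} and {1, 2, 3} are supersets of {4} and {1, 2}, respectively, so the
# minimal sets are frozenset({4}), frozenset({1, 2}), and frozenset({2, 3}).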
Computations for \(\Delta_7\) (minimal supports)
We apply this optimization for \(n=7\) and \(d=7,8\) to reduce the search space further.
[4]:
to_compute = [(7,7), (7,8)]
for n, d in to_compute:
    pos_support_size = n + 1
    data = find_positive_supports(pos_support_size, d)
    iter_start_time = time.perf_counter()
    min_data = find_minimal_sets_parallel(data)
    iter_elapsed_time = time.perf_counter() - iter_start_time
    print(f"n={n:<2}, d={d:<2} | Minimal supports found: {len(min_data):<7} | Time: {iter_elapsed_time:>5.2f}s", flush=True)
n=7 , d=7 | Minimal supports found: 15779 | Time: 5.28s
n=7 , d=8 | Minimal supports found: 64000 | Time: 63.35s
Computations for \(\Delta_8\) (minimal supports)
We apply this optimization for \(n=8\) and \(d=8,9\) to reduce the search space further.
[7]:
to_compute = [(8,8), (8,9)]
for n, d in to_compute:
    pos_support_size = n + 1
    data = find_positive_supports(pos_support_size, d)
    iter_start_time = time.perf_counter()
    min_data = find_minimal_sets_parallel(data)
    iter_elapsed_time = time.perf_counter() - iter_start_time
    print(f"n={n:<2}, d={d:<2} | Minimal supports found: {len(min_data):<7} | Time: {iter_elapsed_time:>5.2f}s", flush=True)
n=8 , d=8 | Minimal supports found: 66830 | Time: 102.85s
n=8 , d=9 | Minimal supports found: 462075 | Time: 3950.16s
Serializing candidate supports
To serialize the candidate supports for later computations, include the following code (to store the minimal supports instead, dump min_data in place of possible_supports).
[ ]:
with open(f'data/n{n:02d}_d{d:02d}.pkl', 'wb') as f:
    pickle.dump(possible_supports, f)
    print(f"Serialized to {f.name}")
print("------------------------------------")
To deserialize the candidate supports, use
[ ]:
with open(f'data/n{n:02d}_d{d:02d}.pkl', 'rb') as f:
    data = pickle.load(f)
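As a minimal sanity check (assuming the dump and load cells above were run in the same session for the same \(n\) and \(d\)), the round trip should preserve the number of supports:
[ ]:
assert len(data) == len(possible_supports)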