@misc{zhou2025payattentionsmallweights,title={Pay Attention to Small Weights},author={Zhou, Chao and Jacobs, Tom and Gadhikar, Advait and Burkholz, Rebekka},year={2025},eprint={2506.21374},archiveprefix={arXiv},primaryclass={cs.LG},url={https://arxiv.org/abs/2506.21374}}
@misc{jacobs2025hamhyperbolicstepregulate,title={HAM: A Hyperbolic Step to Regulate Implicit Bias},author={Jacobs, Tom and Gadhikar, Advait and Rubio-Madrigal, Celia and Burkholz, Rebekka},year={2025},eprint={2506.02630},archiveprefix={arXiv},primaryclass={cs.LG},url={https://arxiv.org/abs/2506.02630}}
@misc{reddy2025shifthappensconfounding,title={When Shift Happens - Confounding Is to Blame},author={Reddy, Abbavaram Gowtham and Rubio-Madrigal, Celia and Burkholz, Rebekka and Muandet, Krikamol},year={2025},eprint={2505.21422},archiveprefix={arXiv},primaryclass={cs.LG},url={https://arxiv.org/abs/2505.21422}}
@misc{gadhikar2025signinlotteryreparameterizingsparse,title={Sign-In to the Lottery: Reparameterizing Sparse Training From Scratch},author={Gadhikar, Advait and Jacobs, Tom and Zhou, Chao and Burkholz, Rebekka},year={2025},eprint={2504.12801},archiveprefix={arXiv},primaryclass={cs.LG},url={https://arxiv.org/abs/2504.12801}}
@misc{gadhikar2024cyclicsparsetrainingenough,title={Cyclic Sparse Training: Is it Enough?},author={Gadhikar, Advait and Nelaturu, Sree Harsha and Burkholz, Rebekka},year={2024},eprint={2406.02773},archiveprefix={arXiv},primaryclass={cs.LG},url={https://arxiv.org/abs/2406.02773},}
@misc{fischer2022lotteryticketsnonzerobiases,title={Lottery Tickets with Nonzero Biases},author={Fischer, Jonas and Gadhikar, Advait and Burkholz, Rebekka},year={2022},eprint={2110.11150},archiveprefix={arXiv},primaryclass={cs.LG},url={https://arxiv.org/abs/2110.11150},}
Implicit bias plays an important role in explaining how overparameterized models generalize well. In addition, explicit regularization such as weight decay is often employed to prevent overfitting. While both concepts have been studied separately, in practice they often act in tandem. Understanding their interplay is key to controlling the shape and strength of implicit bias, as it can be modified by explicit regularization. To this end, we incorporate explicit regularization into the mirror flow framework and analyze its lasting effects on the geometry of the training dynamics, covering three distinct effects: positional bias, type of bias, and range shrinking. Our analytical approach encompasses a broad class of problems, including sparse coding, matrix sensing, single-layer attention, and LoRA, for which we demonstrate the utility of our insights. To exploit the lasting effect of regularization and highlight the potential benefit of dynamic weight decay schedules, we propose to switch off weight decay during training, which can improve generalization, as we demonstrate in experiments.
@inproceedings{jacobs2025mirror,title={Mirror, Mirror of the Flow: How Does Regularization Shape Implicit Bias?},author={Jacobs, Tom and Zhou, Chao and Burkholz, Rebekka},booktitle={Forty-second International Conference on Machine Learning},year={2025},url={https://arxiv.org/abs/2504.12883},}
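In its simplest form, the dynamic weight-decay schedule highlighted above amounts to switching off the explicit penalty partway through training. Below is a minimal PyTorch sketch of this idea; the model, data loader, hyperparameters, and the single switch-off epoch are illustrative placeholders, not the paper's experimental setup.

```python
import torch

def train_with_wd_switch_off(model, loader, epochs=100, switch_off_at=50,
                             lr=0.1, weight_decay=5e-4):
    """Train with weight decay, then disable it partway through training."""
    optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                                weight_decay=weight_decay)
    loss_fn = torch.nn.CrossEntropyLoss()
    for epoch in range(epochs):
        if epoch == switch_off_at:
            # Switch off explicit regularization; per the analysis above,
            # its effect on the implicit bias persists afterwards.
            for group in optimizer.param_groups:
                group["weight_decay"] = 0.0
        for x, y in loader:
            optimizer.zero_grad()
            loss_fn(model(x), y).backward()
            optimizer.step()
    return model
```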
Continuous sparsification strategies are among the most effective methods for reducing the inference costs and memory demands of large-scale neural networks. A key factor in their success is the implicit L1 regularization induced by jointly learning both mask and weight variables, which has been shown experimentally to outperform explicit L1 regularization. We provide a theoretical explanation for this observation by analyzing the learning dynamics, revealing that early continuous sparsification is governed by an implicit L2 regularization that gradually transitions to an L1 penalty over time. Leveraging this insight, we propose a method to dynamically control the strength of this implicit bias. Through an extension of the mirror flow framework, we establish convergence and optimality guarantees in the context of underdetermined linear regression. Our theoretical findings may be of independent interest, as we demonstrate how to enter the rich regime and show that the implicit bias can be controlled via a time-dependent Bregman potential. To validate these insights, we introduce PILoT, a continuous sparsification approach with novel initialization and dynamic regularization, which consistently outperforms baselines in standard experiments.
@inproceedings{jacobs2025mask,title={Mask in the Mirror: Implicit Sparsification},author={Jacobs, Tom and Burkholz, Rebekka},booktitle={The Thirteenth International Conference on Learning Representations},year={2025},url={https://openreview.net/forum?id=U47ymTS3ut},}
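To make the mask-times-weight parameterization concrete, here is a toy continuous-sparsification setup for underdetermined linear regression, where the effective weights are the product of mask and weight variables learned jointly by gradient descent. It is only a sketch of the parameterization, not PILoT itself, which additionally uses a specific initialization and dynamic regularization.

```python
import torch

torch.manual_seed(0)
n, d = 20, 100                       # fewer samples than features
X = torch.randn(n, d)
w_star = torch.zeros(d)
w_star[:5] = 1.0                     # sparse ground truth
y = X @ w_star

m = torch.full((d,), 0.5, requires_grad=True)   # mask variables
s = (0.1 * torch.randn(d)).requires_grad_()     # weight variables
opt = torch.optim.SGD([m, s], lr=0.05)

for _ in range(5000):
    opt.zero_grad()
    loss = ((X @ (m * s) - y) ** 2).mean()      # effective weights: m * s
    loss.backward()
    opt.step()

print("largest effective weights:", (m * s).abs().topk(5).indices.tolist())
```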
Maximizing the spectral gap through graph rewiring has been proposed to enhance the performance of message-passing graph neural networks (GNNs) by addressing over-squashing. However, as we show, minimizing the spectral gap can also improve generalization. To explain this, we analyze how rewiring can benefit GNNs within the context of stochastic block models. Since spectral gap optimization primarily influences community strength, it improves performance when the community structure aligns with node labels. Building on this insight, we propose three distinct rewiring strategies that explicitly target community structure, node labels, and their alignment: (a) community structure-based rewiring (ComMa), a more computationally efficient alternative to spectral gap optimization that achieves similar goals; (b) feature similarity-based rewiring (FeaSt), which focuses on maximizing global homophily; and (c) a hybrid approach (ComFy), which enhances local feature similarity while preserving community structure to optimize label-community alignment. Extensive experiments confirm the effectiveness of these strategies and support our theoretical insights.
@inproceedings{rubio-madrigal2025gnns,title={{GNN}s Getting ComFy: Community and Feature Similarity Guided Rewiring},author={Rubio-Madrigal, Celia and Jamadandi, Adarsh and Burkholz, Rebekka},booktitle={The Thirteenth International Conference on Learning Representations},year={2025},url={https://openreview.net/forum?id=g6v09VxgFw},}
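As a rough illustration of feature-similarity-based rewiring in the spirit of FeaSt, the sketch below adds a small budget of edges between the most similar non-adjacent node pairs. The cosine-similarity scoring and the fixed edge budget are assumptions made for illustration, not the authors' implementation.

```python
import networkx as nx
import numpy as np

def feature_similarity_rewiring(G, features, budget=10):
    """Add `budget` edges between the most feature-similar non-adjacent pairs."""
    X = features / (np.linalg.norm(features, axis=1, keepdims=True) + 1e-12)
    sim = X @ X.T                               # cosine similarity
    nodes = list(G.nodes())
    candidates = [
        (sim[i, j], nodes[i], nodes[j])
        for i in range(len(nodes)) for j in range(i + 1, len(nodes))
        if not G.has_edge(nodes[i], nodes[j])
    ]
    candidates.sort(key=lambda t: t[0], reverse=True)
    for _, u, v in candidates[:budget]:
        G.add_edge(u, v)
    return G
```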
Message Passing Graph Neural Networks are known to suffer from two problems that are sometimes believed to be diametrically opposed: over-squashing and over-smoothing. The former results from topological bottlenecks that hamper the information flow from distant nodes and are mitigated by spectral gap maximization, primarily by means of edge additions. However, such additions often promote over-smoothing that renders nodes of different classes less distinguishable. Inspired by the Braess phenomenon, we argue that deleting edges can address over-squashing and over-smoothing simultaneously. This insight explains how edge deletions can improve generalization, thus connecting spectral gap optimization to a seemingly disconnected objective of reducing computational resources by pruning graphs for lottery tickets. To this end, we propose a computationally effective spectral gap optimization framework to add or delete edges and demonstrate its effectiveness on the long range graph benchmark and on larger heterophilous datasets.
@inproceedings{jamadandi2024spectral,title={Spectral Graph Pruning Against Over-Squashing and Over-Smoothing},author={Jamadandi, Adarsh and Rubio-Madrigal, Celia and Burkholz, Rebekka},booktitle={Thirty-eighth Conference on Neural Information Processing Systems},year={2024},url={https://openreview.net/forum?id=EMkrwJY2de},}
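The sketch below illustrates the Braess-style intuition with a brute-force greedy procedure that deletes an edge whenever doing so increases the spectral gap of the normalized Laplacian. It is meant only to make the objective concrete; the paper's framework is far more computationally efficient than recomputing eigenvalues for every candidate edge.

```python
import networkx as nx
import numpy as np

def spectral_gap(G):
    """Second-smallest eigenvalue of the normalized graph Laplacian."""
    L = nx.normalized_laplacian_matrix(G).toarray()
    return np.sort(np.linalg.eigvalsh(L))[1]

def greedy_edge_deletion(G, n_deletions=5):
    """Greedily delete edges that increase the spectral gap, keeping G connected."""
    G = G.copy()
    for _ in range(n_deletions):
        best_edge, best_gap = None, spectral_gap(G)
        for e in list(G.edges()):
            H = G.copy()
            H.remove_edge(*e)
            if nx.is_connected(H):
                gap = spectral_gap(H)
                if gap > best_gap:
                    best_edge, best_gap = e, gap
        if best_edge is None:
            break
        G.remove_edge(*best_edge)
    return G
```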
Graph neural networks (GNNs) with a rescale invariance, such as GATs, can be re-parameterized during optimization through dynamic rescaling of network parameters and gradients while keeping the loss invariant. In this work, we explore dynamic rescaling as a tool to influence GNN training dynamics in two key ways: i) balancing the network with respect to various criteria, and ii) controlling the relative learning speeds of different layers. We gain novel insights, unique to GNNs, that reveal distinct training modes for different tasks. For heterophilic graphs, achieving balance based on relative gradients leads to faster training and better generalization. In contrast, homophilic graphs benefit from delaying the learning of later layers. Additionally, we show that training in balance supports larger learning rates, which can improve generalization. Moreover, controlling layer-wise training speeds is linked to grokking-like phenomena, which may be of independent interest.
@inproceedings{mustafa2024dynamic,title={Dynamic Rescaling for Training {GNN}s},author={Mustafa, Nimrah and Burkholz, Rebekka},booktitle={Thirty-eighth Annual Conference on Neural Information Processing Systems},year={2024},url={https://openreview.net/forum?id=IfZwSRpqHl},}
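The rescale invariance underlying dynamic rescaling can be seen in a two-layer ReLU toy model: scaling a hidden neuron's incoming weights and bias by alpha and its outgoing weights by 1/alpha leaves the network function, and hence the loss, unchanged. The snippet below shows this generic positive-homogeneity example, not the GAT-specific setting analyzed in the paper.

```python
import torch

torch.manual_seed(0)
W1, b1, W2 = torch.randn(8, 4), torch.randn(8), torch.randn(1, 8)
x = torch.randn(5, 4)

def f(W1, b1, W2, x):
    return torch.relu(x @ W1.T + b1) @ W2.T

alpha = 3.0
# Rescale: incoming weights and bias up by alpha, outgoing weights down by alpha.
print(torch.allclose(f(W1, b1, W2, x),
                     f(alpha * W1, alpha * b1, W2 / alpha, x), atol=1e-5))  # True
```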
The practical utility of machine learning models in the sciences often hinges on their interpretability. It is common to assess a model’s merit for scientific discovery, and thus novel insights, by how well it aligns with already available domain knowledge - a dimension that is currently largely disregarded in the comparison of neural network models. While pruning can simplify deep neural network architectures and excels in identifying sparse models, as we show in the context of gene regulatory network inference, state-of-the-art techniques struggle with biologically meaningful structure learning. To address this issue, we propose DASH, a generalizable framework that guides network pruning by using domain-specific structural information in model fitting and leads to sparser, more interpretable models that are more robust to noise. Using both synthetic data with ground truth information, as well as real-world gene expression data, we show that DASH, using knowledge about gene interaction partners within the putative regulatory network, outperforms general pruning methods by a large margin and yields deeper insights into the biological systems being studied.
@inproceedings{hossain2024pruning,title={Pruning neural network models for gene regulatory dynamics using data and domain knowledge},author={Hossain, Intekhab and Fischer, Jonas and Burkholz, Rebekka and Quackenbush, John},booktitle={Thirty-eighth Conference on Neural Information Processing Systems},year={2024},url={https://openreview.net/forum?id=FNtsZLwkGr},}
Gene regulatory network (GRN) models that are formulated as ordinary differential equations (ODEs) can accurately explain temporal gene expression patterns and promise to yield new insights into important cellular processes, disease progression, and intervention design. Learning such gene regulatory ODEs is challenging, since we want to predict the evolution of gene expression in a way that accurately encodes the underlying GRN governing the dynamics and the nonlinear functional relationships between genes. Most widely used ODE estimation methods either impose too many parametric restrictions or are not guided by meaningful biological insights, both of which impede scalability, explainability, or both.
@article{hossain2024biologically,author={Hossain, Intekhab and Fanfani, Viola and Fischer, Jonas and Quackenbush, John and Burkholz, Rebekka},title={Biologically informed NeuralODEs for genome-wide regulatory dynamics},journal={Genome Biology},year={2024},month=may,volume={25},url={https://doi.org/10.1186/s13059-024-03264-0},}
Shiwei Liu, Kai Han, Adriana Fernandez-Lopez, Ajay Kumar Jaiswal, Zahra Atashgahi, Boqian Wu, Edoardo Ponti, Callie Hao, Rebekka Burkholz, Olga Saukh, Lu Yin, Andreas Zinonos, Tianjin Huang, Jared Tanner, and Yunhe Wang
@inproceedings{liu2024edgellms,title={Edge-{LLM}s: Edge-Device Large Language Model Competition},author={Liu, Shiwei and Han, Kai and Fernandez-Lopez, Adriana and Jaiswal, Ajay Kumar and Atashgahi, Zahra and Wu, Boqian and Ponti, Edoardo and Hao, Callie and Burkholz, Rebekka and Saukh, Olga and Yin, Lu and Zinonos, Andreas and Huang, Tianjin and Tanner, Jared and Wang, Yunhe},booktitle={NeurIPS 2024 Competition Track},year={2024},url={https://openreview.net/forum?id=jeCMRoIn15}}
Graph Attention Networks (GATs) are designed to provide flexible neighborhood aggregation that assigns weights to neighbors according to their importance. In practice, however, GATs are often unable to switch off task-irrelevant neighborhood aggregation, as we show experimentally and analytically. To address this challenge, we propose GATE, a GAT extension that holds three major advantages: i) It alleviates over-smoothing by addressing its root cause of unnecessary neighborhood aggregation. ii) Similarly to perceptrons, it benefits from higher depth as it can still utilize additional layers for (non-)linear feature transformations in case of (nearly) switched-off neighborhood aggregation. iii) By down-weighting connections to unrelated neighbors, it often outperforms GATs on real-world heterophilic datasets. To further validate our claims, we construct a synthetic test bed to analyze a model’s ability to utilize the appropriate amount of neighborhood aggregation, which could be of independent interest.
@inproceedings{mustafa2024gate,title={{GATE}: How to Keep Out Intrusive Neighbors},author={Mustafa, Nimrah and Burkholz, Rebekka},booktitle={Forty-first International Conference on Machine Learning},year={2024},url={https://openreview.net/forum?id=Sjv5RcqfuH},}
Learning Rate Rewinding (LRR) has been established as a strong variant of Iterative Magnitude Pruning (IMP) to find lottery tickets in deep overparameterized neural networks. While both iterative pruning schemes couple structure and parameter learning, understanding how LRR excels in both aspects can bring us closer to the design of more flexible deep learning algorithms that can optimize diverse sets of sparse architectures. To this end, we conduct experiments that disentangle the effect of mask learning and parameter optimization and how both benefit from overparameterization. The ability of LRR to flip parameter signs early and stay robust to sign perturbations seems to make it not only more effective in mask identification but also in optimizing diverse sets of masks, including random ones. In support of this hypothesis, we prove in a simplified single hidden neuron setting that LRR succeeds in more cases than IMP, as it can escape initially problematic sign configurations.
@inproceedings{gadhikar2024masks,title={Masks, Signs, And Learning Rate Rewinding},author={Gadhikar, Advait Harshal and Burkholz, Rebekka},booktitle={The Twelfth International Conference on Learning Representations},year={2024},url={https://openreview.net/forum?id=qODvxQ8TXW},}
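For concreteness, a simplified sketch of learning rate rewinding is given below: after each training round the smallest-magnitude weights are pruned, the surviving weights keep their trained values, and only the learning rate schedule is restarted. The global unstructured pruning criterion and the `train_one_round` placeholder are assumptions for illustration, not the exact experimental protocol.

```python
import torch

def global_magnitude_prune(model, fraction):
    """Zero out the globally smallest-magnitude fraction of weights; return masks."""
    all_w = torch.cat([p.detach().abs().flatten() for p in model.parameters()])
    threshold = torch.quantile(all_w, fraction)
    masks = {}
    for name, p in model.named_parameters():
        masks[name] = (p.detach().abs() > threshold).float()
        p.data.mul_(masks[name])
    return masks

def learning_rate_rewinding(model, train_one_round, rounds=5, prune_fraction=0.2):
    """IMP with LR rewinding: weights are kept, only the LR schedule restarts."""
    masks = None
    for _ in range(rounds):
        train_one_round(model, masks)   # full schedule from the initial learning rate
        masks = global_magnitude_prune(model, prune_fraction)
    return model, masks
```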
Normalization layers, for which Batch Normalization (BN) is a popular choice, are an integral part of many deep learning architectures and contribute significantly to the learning success. We provide a partial explanation for this phenomenon by proving that training normalization layers alone is already sufficient for universal function approximation if the number of available, potentially random features matches or exceeds the number of weight parameters of the target networks that can be expressed. Our bound on the number of required features does not only improve on a recent result for fully-connected feed-forward architectures but also applies to CNNs with and without residual connections and almost arbitrary activation functions (which include ReLUs). Our explicit construction of a given target network solves a depth-width trade-off that is driven by architectural constraints and can explain why switching off entire neurons can have representational benefits, as has been observed empirically. To validate our theory, we explicitly match target networks that outperform experimentally obtained networks with trained BN parameters by utilizing a sufficient number of random features.
@inproceedings{burkholz2024batch,title={Batch normalization is sufficient for universal function approximation in {CNN}s},author={Burkholz, Rebekka},booktitle={The Twelfth International Conference on Learning Representations},year={2024},url={https://openreview.net/forum?id=wOSYMHfENq},}
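Training only the normalization layers, as studied above, can be set up in PyTorch by freezing everything except the BatchNorm affine parameters. A minimal sketch, assuming a standard torchvision-style model:

```python
import torch.nn as nn

def freeze_all_but_bn(model):
    """Make only BatchNorm affine parameters trainable; return them for the optimizer."""
    for module in model.modules():
        is_bn = isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d))
        for p in module.parameters(recurse=False):
            p.requires_grad = is_bn
    return [p for p in model.parameters() if p.requires_grad]

# Usage: optimizer = torch.optim.SGD(freeze_all_but_bn(model), lr=0.1)
```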
While the expressive power and computational capabilities of graph neural networks (GNNs) have been theoretically studied, their optimization and learning dynamics, in general, remain largely unexplored. Our study examines the Graph Attention Network (GAT), a popular GNN architecture in which a node’s neighborhood aggregation is weighted by parameterized attention coefficients. We derive a conservation law of GAT gradient flow dynamics, which explains why a large fraction of parameters in GATs with standard initialization struggle to change during training. This effect is amplified in deeper GATs, which perform significantly worse than their shallow counterparts. To alleviate this problem, we devise an initialization scheme that balances the GAT network. Our approach i) allows more effective propagation of gradients and in turn enables trainability of deeper networks, and ii) attains a considerable speedup in training and convergence time in comparison to the standard initialization. Our main theorem serves as a stepping stone to studying the learning dynamics of positive homogeneous models with attention mechanisms.
@inproceedings{mustafa2023are,title={Are {GAT}s Out of Balance?},author={Mustafa, Nimrah and Bojchevski, Aleksandar and Burkholz, Rebekka},booktitle={Thirty-seventh Conference on Neural Information Processing Systems},year={2023},url={https://openreview.net/forum?id=qY7UqLoora},}
Random masks define surprisingly effective sparse neural network models, as has been shown empirically. The resulting sparse networks can often compete with dense architectures and state-of-the-art lottery ticket pruning algorithms, even though they do not rely on computationally expensive prune-train iterations and can be drawn initially without significant computational overhead. We offer a theoretical explanation of how random masks can approximate arbitrary target networks if they are wider by a logarithmic factor in the inverse sparsity, 1/log(1/sparsity). This overparameterization factor is necessary at least for 3-layer random networks, which elucidates the observed degrading performance of random networks at higher sparsity. At moderate to high sparsity levels, however, our results imply that sparser networks are contained within random source networks so that any dense-to-sparse training scheme can be turned into a computationally more efficient sparse-to-sparse one by constraining the search to a fixed random mask. We demonstrate the feasibility of this approach in experiments for different pruning methods and propose particularly effective choices of initial layer-wise sparsity ratios of the random source network. As a special case, we show theoretically and experimentally that random source networks also contain strong lottery tickets.
@inproceedings{pmlr-v202-gadhikar23a,title={Why Random Pruning Is All We Need to Start Sparse},author={Gadhikar, Advait Harshal and Mukherjee, Sohom and Burkholz, Rebekka},booktitle={Proceedings of the 40th International Conference on Machine Learning},pages={10542--10570},year={2023},editor={Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},volume={202},series={Proceedings of Machine Learning Research},publisher={PMLR},url={https://proceedings.mlr.press/v202/gadhikar23a.html},}
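The sparse-to-sparse recipe suggested by this result is simple: draw one random mask per layer at initialization and constrain training to it. The sketch below shows this constraint in PyTorch; the layer-wise sparsity ratios are inputs here, while the paper proposes particularly effective choices for them.

```python
import torch

def draw_random_masks(model, layer_sparsity):
    """Fixed random mask per parameter tensor; `layer_sparsity` maps parameter
    names to the fraction of weights to remove (missing names stay dense)."""
    return {
        name: (torch.rand_like(p) >= layer_sparsity.get(name, 0.0)).float()
        for name, p in model.named_parameters()
    }

def apply_masks(model, masks):
    """Zero masked weights; call after initialization and after every optimizer step."""
    with torch.no_grad():
        for name, p in model.named_parameters():
            p.mul_(masks[name])
```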
Marouen Ben Guebila, Tian Wang, Camila M. Lopes-Ramos, Viola Fanfani, Des Weighill, Rebekka Burkholz, Daniel Schlauch, Joseph N. Paulson, Michael Altenbuchinger, Katherine H. Shutta, Abhijeet R. Sonawane, James Lim, Genis Calderer, David G.P. IJzendoorn, Daniel Morgan, Alessandro Marin, Cho-Yi Chen, Qi Song, Enakshi Saha, Dawn L. DeMeo, Megha Padi, John Platig, Marieke L. Kuijjer, Kimberly Glass, and John Quackenbush
Inference and analysis of gene regulatory networks (GRNs) require software that integrates multi-omic data from various sources. The Network Zoo (netZoo; netzoo.github.io) is a collection of open-source methods to infer GRNs, conduct differential network analyses, estimate community structure, and explore the transitions between biological states. The netZoo builds on our ongoing development of network methods, harmonizing the implementations in various computing languages and between methods to allow better integration of these tools into analytical pipelines. We demonstrate the utility using multi-omic data from the Cancer Cell Line Encyclopedia. We will continue to expand the netZoo to incorporate additional methods.
@article{BenGuebila2023,author={Ben Guebila, Marouen and Wang, Tian and Lopes-Ramos, Camila M. and Fanfani, Viola and Weighill, Des and Burkholz, Rebekka and Schlauch, Daniel and Paulson, Joseph N. and Altenbuchinger, Michael and Shutta, Katherine H. and Sonawane, Abhijeet R. and Lim, James and Calderer, Genis and van IJzendoorn, David G.P. and Morgan, Daniel and Marin, Alessandro and Chen, Cho-Yi and Song, Qi and Saha, Enakshi and DeMeo, Dawn L. and Padi, Megha and Platig, John and Kuijjer, Marieke L. and Glass, Kimberly and Quackenbush, John},title={The Network Zoo: a multilingual package for the inference and analysis of gene regulatory networks},journal={Genome Biology},year={2023},day={09},volume={24},number={1},pages={45},issn={1474-760X},doi={10.1186/s13059-023-02877-1},url={https://doi.org/10.1186/s13059-023-02877-1},}
Katherine H Shutta, Deborah Weighill, Rebekka Burkholz, Marouen Ben Guebila, Dawn L DeMeo, Helena U Zacharias, John Quackenbush, and Michael Altenbuchinger
The increasing quantity of multi-omic data, such as methylomic and transcriptomic profiles collected on the same specimen or even on the same cell, provides a unique opportunity to explore the complex interactions that define cell phenotype and govern cellular responses to perturbations. We propose a network approach based on Gaussian Graphical Models (GGMs) that facilitates the joint analysis of paired omics data. This method, called DRAGON (Determining Regulatory Associations using Graphical models on multi-Omic Networks), calibrates its parameters to achieve an optimal trade-off between the network’s complexity and estimation accuracy, while explicitly accounting for the characteristics of each of the assessed omics ‘layers.’ In simulation studies, we show that DRAGON adapts to edge density and feature size differences between omics layers, improving model inference and edge recovery compared to state-of-the-art methods. We further demonstrate in an analysis of joint transcriptome - methylome data from TCGA breast cancer specimens that DRAGON can identify key molecular mechanisms such as gene regulation via promoter methylation. In particular, we identify Transcription Factor AP-2 Beta (TFAP2B) as a potential multi-omic biomarker for basal-type breast cancer. DRAGON is available as open-source code in Python through the Network Zoo package (netZooPy v0.8; netzoo.github.io).
@article{10.1093/nar/gkac1157,author={Shutta, Katherine H and Weighill, Deborah and Burkholz, Rebekka and Guebila, Marouen Ben and DeMeo, Dawn L and Zacharias, Helena U and Quackenbush, John and Altenbuchinger, Michael},title={DRAGON: Determining Regulatory Associations using Graphical models on multi-Omic Networks},journal={Nucleic Acids Research},volume={51},number={3},pages={e15-e15},year={2022},issn={0305-1048},doi={10.1093/nar/gkac1157},url={https://doi.org/10.1093/nar/gkac1157},eprint={https://academic.oup.com/nar/article-pdf/51/3/e15/49192710/gkac1157.pdf},}
The strong lottery ticket hypothesis has highlighted the potential for training deep neural networks by pruning, which has inspired interesting practical and theoretical insights into how neural networks can represent functions. For networks with ReLU activation functions, it has been proven that a target network with depth L can be approximated by the subnetwork of a randomly initialized neural network that has double the target’s depth 2L and is wider by a logarithmic factor. We show that a depth L+1 is sufficient. This result indicates that we can expect to find lottery tickets at realistic, commonly used depths while only requiring logarithmic overparametrization. Our novel construction approach applies to a large class of activation functions and is not limited to ReLUs. Code is available on Github (RelationalML/LT-existence).
@inproceedings{NEURIPS2022_76bf7786,author={Burkholz, Rebekka},booktitle={Advances in Neural Information Processing Systems},editor={Koyejo, S. and Mohamed, S. and Agarwal, A. and Belgrave, D. and Cho, K. and Oh, A.},pages={18707--18720},publisher={Curran Associates, Inc.},title={Most Activation Functions Can Win the Lottery Without Excessive Depth},url={https://papers.nips.cc/paper_files/paper/2022/hash/76bf7786d311217077bc8bb021946cd9-Abstract-Conference.html},volume={35},year={2022},}
The Lottery Ticket Hypothesis continues to have a profound practical impact on the quest for small-scale deep neural networks that solve modern deep learning tasks at competitive performance. These lottery tickets are identified by pruning large randomly initialized neural networks with architectures that are as diverse as their applications. Yet, theoretical insights that attest to their existence have mostly focused on deep fully-connected feed-forward networks with ReLU activation functions. We prove that modern architectures consisting of convolutional and residual layers, which can be equipped with almost arbitrary activation functions, can also contain lottery tickets with high probability.
@inproceedings{pmlr-v162-burkholz22a,title={Convolutional and Residual Networks Provably Contain Lottery Tickets},author={Burkholz, Rebekka},booktitle={Proceedings of the 39th International Conference on Machine Learning},pages={2414--2433},year={2022},editor={Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},volume={162},series={Proceedings of Machine Learning Research},publisher={PMLR},url={https://proceedings.mlr.press/v162/burkholz22a.html},}
Marouen Ben Guebila, Deborah Weighill, Camila M. Lopes-Ramos, Rebekka Burkholz, Romana T. Pop, Kalyan Palepu, Mia Shapoval, Maud Fagny, Daniel Schlauch, Kimberly Glass, Michael Altenbuchinger, Marieke L. Kuijjer, John Platig, and John Quackenbush
Open access to software in computational and systems biology, including data, code and models, is widely acknowledged as essential for ensuring reproducibility of research results and reuse of methods. Although there are software tools that allow sharing of computational pipelines, these systems generally do not allow the integration of software annotation and documentation at each step in the process — elements that are required to understand and run complex and rapidly evolving software, including methods developed in systems biology for inferring biological pathways. Our research team has been developing network inference and analysis methods, collected into the Network Zoo (http://netzoo.github.io), with implementations in R, C, MATLAB and Python. The growing community of users of these network resources, the increasing interest in learning how to apply network inference methods, and the need to ensure that published analyses are fully reproducible led us to develop Netbooks (http://netbooks.networkmedicine.org), a hosted collection of Jupyter notebooks that provide detailed and annotated step-by-step case studies of GRN analysis.
@article{BenGuebila2022,author={Ben Guebila, Marouen and Weighill, Deborah and Lopes-Ramos, Camila M. and Burkholz, Rebekka and Pop, Romana T. and Palepu, Kalyan and Shapoval, Mia and Fagny, Maud and Schlauch, Daniel and Glass, Kimberly and Altenbuchinger, Michael and Kuijjer, Marieke L. and Platig, John and Quackenbush, John},title={An online notebook resource for reproducible inference, analysis and publication of gene regulatory networks},journal={Nature Methods},year={2022},day={01},volume={19},number={5},pages={511-513},issn={1548-7105},doi={10.1038/s41592-022-01479-2},url={https://doi.org/10.1038/s41592-022-01479-2},}
The lottery ticket hypothesis has sparked the rapid development of pruning algorithms that aim to reduce the computational costs associated with deep learning during training and model deployment. Currently, such algorithms are primarily evaluated on imaging data, for which we lack ground truth information and thus the understanding of how sparse lottery tickets could be. To fill this gap, we develop a framework that allows us to plant and hide winning tickets with desirable properties in randomly initialized neural networks. To analyze the ability of state-of-the-art pruning to identify tickets of extreme sparsity, we design and hide such tickets solving four challenging tasks. In extensive experiments, we observe similar trends as in imaging studies, indicating that our framework can provide transferable insights into realistic problems. Additionally, we can now see beyond such relative trends and highlight limitations of current pruning methods. Based on our results, we conclude that the current limitations in ticket sparsity are likely of algorithmic rather than fundamental nature. We anticipate that comparisons to planted tickets will facilitate future developments of efficient pruning algorithms.
@inproceedings{fischer2022plant,title={Plant 'n' Seek: Can You Find the Winning Ticket?},author={Fischer, Jonas and Burkholz, Rebekka},booktitle={The Tenth International Conference on Learning Representations},year={2022},url={https://openreview.net/forum?id=9n9c8sf0xm},}
The lottery ticket hypothesis conjectures the existence of sparse subnetworks of large randomly initialized deep neural networks that can be successfully trained in isolation. Recent work has experimentally observed that some of these tickets can be practically reused across a variety of tasks, hinting at some form of universality. We formalize this concept and theoretically prove that not only do such universal tickets exist but they also do not require further training. Our proofs introduce a couple of technical innovations related to pruning for strong lottery tickets, including extensions of subset sum results and a strategy to leverage higher amounts of depth. Our explicit sparse constructions of universal function families might be of independent interest, as they highlight representational benefits induced by univariate convolutional architectures.
@inproceedings{burkholz2022on,title={On the Existence of Universal Lottery Tickets},author={Burkholz, Rebekka and Laha, Nilanjana and Mukherjee, Rajarshi and Gotovos, Alkis},booktitle={The Tenth International Conference on Learning Representations},year={2022},url={https://openreview.net/forum?id=SYB4WrJql1n},}
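The subset-sum arguments mentioned above can be illustrated in one dimension: a target weight is approximated by the sum of a suitably chosen subset of a modest number of random coefficients. Below is a brute-force toy check of this idea; the theory guarantees that logarithmically many coefficients suffice with high probability, while the concrete target and sample size here are arbitrary.

```python
import itertools
import numpy as np

rng = np.random.default_rng(0)
target = 0.37
coeffs = rng.uniform(-1, 1, size=15)      # random candidate weights

# Best subset sum over all 2^15 subsets (the empty subset sums to 0).
best = min(
    (sum(c) for r in range(len(coeffs) + 1)
     for c in itertools.combinations(coeffs, r)),
    key=lambda s: abs(s - target),
)
print("approximation error:", abs(best - target))  # typically very small
```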