Publications | Loïc Pottier

2022

Performance assessment of ensembles of in situ workflows under resource constraints

Tu Mai Anh Do, Loïc Pottier, Rafael Ferreira da Silva, Silvina Caíno-Lores, Michela Taufer, and Ewa Deelman

Concurrency and Computation: Practice and Experience 2022

Bib HTML

@article{do-ccpe-2022,
  author   = {Do, Tu Mai Anh and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Ca{\'\i}no-Lores, Silvina and Taufer, Michela and Deelman, Ewa},
  title    = {Performance assessment of ensembles of in situ workflows under resource constraints},
  journal  = {Concurrency and Computation: Practice and Experience},
  year     = {2022},
  doi      = {10.1002/cpe.7111},
  keywords = {mine,notalpha,isi},
  note     = {Funding Acknowledgments: NSF 1664162},
}

FGCS

WfCommons: A framework for enabling scientific workflow research and development

Tainã Coleman, Henri Casanova, Loïc Pottier, Manav Kaushik, Ewa Deelman, and Rafael Ferreira da Silva

Future Generation Computer Systems 2022

Bib HTML

@article{coleman-fgcs-2022,
  author   = {Coleman, Tain{\~a} and Casanova, Henri and Pottier, Lo{\"i}c and Kaushik, Manav and Deelman, Ewa and {Ferreira da Silva}, Rafael},
  title    = {WfCommons: A framework for enabling scientific workflow research and development},
  journal  = {Future Generation Computer Systems},
  volume   = {128},
  pages    = {16--27},
  year     = {2022},
  issn     = {0167-739X},
  doi      = {10.1016/j.future.2021.09.043},
  keywords = {mine,isi},
  note     = {Funding Acknowledgments: NSF 1923539},
}

JSSPP

On the Feasibility of Simulation-driven Portfolio Scheduling for Cyberinfrastructure Runtime Systems

Henri Casanova, Yick Ching Wong, Loïc Pottier, and Rafael Ferreira da Silva

In Job Scheduling Strategies for Parallel Processing (JSSPP) 2022

Bib

@inproceedings{casanova-jssp-2022,
  author    = {Casanova, Henri and Ching Wong, Yick and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael},
  title     = {On the Feasibility of Simulation-driven Portfolio Scheduling for Cyberinfrastructure Runtime Systems},
  booktitle = {Job Scheduling Strategies for Parallel Processing (JSSPP)},
  publisher = {Springer Nature},
  year      = {2022},
  pubstate  = {forthcoming},
  note      = {Funding Acknowledgments: NSF 2106059 and 2106147, DOE DE-AC05-00OR22725},
  keywords  = {mine,notalpha,isi},
}

CCGrid

Accelerating Scientific Workflows on HPC Platforms with In Situ Processing

Tu Mai Anh Do, Loïc Pottier, Orcun Yildiz, Karan Vahi, Patrycja Krawczuk, Tom Peterka, and 1 more author

In IEEE/ACM 22nd International Symposium on Cluster, Cloud and Internet Computing (CCGrid) 2022

Bib

@inproceedings{pottier-ccgrid-2022,
  author       = {Do, Tu Mai Anh and Pottier, Lo{\"i}c and Yildiz, Orcun and Vahi, Karan and Krawczuk, Patrycja and Peterka, Tom and Deelman, Ewa},
  author+an    = {1=jointfirst;2=jointfirst},
  title        = {Accelerating Scientific Workflows on HPC Platforms with In Situ Processing},
  booktitle    = {IEEE/ACM 22nd International Symposium on Cluster, Cloud and Internet Computing (CCGrid)},
  organization = {IEEE},
  pages        = {1--10},
  year         = {2022},
  doi          = {10.1109/CCGrid54584.2022.00009},
  note         = {Funding Acknowledgments: NSF 1664162, DOE DE-AC02-06CH11357, DE-AC02-05CH11231, DE-SC0012636 and DE-SC0022328},
  keywords     = {mine,notalpha,isi},
  addendum     = {\textcolor{brown}{\emph{The highlighted authors are joint first authors with equal contributions.}}},
}

2021

JOCS

A Lightweight Method for Evaluating In Situ Workflow Efficiency

Tu Mai Anh Do, Loïc Pottier, Silvina Caíno-Lores, Rafael Ferreira da Silva, Michel A. Cuendet, Harel Weinstein, and 3 more authors

Journal of Computational Science 2021

Bib

@article{do2021jocs,
  author   = {Do, Tu Mai Anh and Pottier, Lo{\"i}c and Ca{\'\i}no-Lores, Silvina and {Ferreira da Silva}, Rafael and Cuendet, Michel A. and Weinstein, Harel and Estrada, Trilce and Taufer, Michela and Deelman, Ewa},
  title    = {A Lightweight Method for Evaluating In Situ Workflow Efficiency},
  journal  = {Journal of Computational Science},
  volume   = {48},
  year     = {2021},
  doi      = {10.1016/j.jocs.2020.101259},
  keywords = {mine,isi},
  note     = {Funding Acknowledgments: NSF 1741040, DOE DE-SC0012636},
}

WORKS

A Performance Characterization of Scientific Machine Learning Workflows

Patrycja Krawczuk, George Papadimitriou, Ryan Tanaka, Tu Mai Anh Do, Srujana Subramany, Shubham Nagarkar, and 5 more authors

In 2021 IEEE/ACM Workflows in Support of Large-Scale Science (WORKS) 2021

Bib

@inproceedings{krawczuk-works-2021,
  author    = {Krawczuk, Patrycja and Papadimitriou, George and Tanaka, Ryan and Do, Tu Mai Anh and Subramany, Srujana and Nagarkar, Shubham and Jain, Aditi and Lam, Kelsie and Mandal, Anirban and Pottier, Lo{\"i}c and Deelman, Ewa},
  title     = {A Performance Characterization of Scientific Machine Learning Workflows},
  booktitle = {2021 IEEE/ACM Workflows in Support of Large-Scale Science (WORKS)},
  year      = {2021},
  pages     = {58--65},
  keywords  = {mine,workshop,isi},
  doi       = {10.1109/WORKS54523.2021.00013},
  note      = {Funding Acknowledgments: DOE DE-SC0012636, NSF 1664162},
}

ICPP

Assessing Resource Provisioning and Allocation of Ensembles of In Situ Workflows

Tu Mai Anh Do, Loïc Pottier, Rafael Ferreira da Silva, Silvina Caíno-Lores, Michela Taufer, and Ewa Deelman

In 50th International Conference on Parallel Processing Workshop 2021

Bib HTML

@inproceedings{do2021p2s2,
  author    = {Do, Tu Mai Anh and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Ca{\'\i}no-Lores, Silvina and Taufer, Michela and Deelman, Ewa},
  title     = {Assessing Resource Provisioning and Allocation of Ensembles of In Situ Workflows},
  year      = {2021},
  isbn      = {9781450384414},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  doi       = {10.1145/3458744.3474051},
  booktitle = {50th International Conference on Parallel Processing Workshop},
  articleno = {38},
  numpages  = {10},
  location  = {Lemont, IL, USA},
  series    = {ICPP Workshops '21},
  keywords  = {mine,workshop,isi},
  note      = {Funding Acknowledgments: NSF 1741040, DOE DE-SC0012636},
}

2020

ICCS

A Novel Metric to Evaluate In Situ Workflows

Tu Mai Anh Do, Loïc Pottier, Stephen Thomas, Rafael Ferreira da Silva, Michel A. Cuendet, Harel Weinstein, and 3 more authors

In International Conference on Computational Science (ICCS) 2020

Bib

@inproceedings{do2020iccs,
  title     = {A Novel Metric to Evaluate In Situ Workflows},
  author    = {Do, Tu Mai Anh and Pottier, Lo{\"i}c and Thomas, Stephen and {Ferreira da Silva}, Rafael and Cuendet, Michel A. and Weinstein, Harel and Estrada, Trilce and Taufer, Michela and Deelman, Ewa},
  booktitle = {International Conference on Computational Science (ICCS)},
  pages     = {538--553},
  year      = {2020},
  doi       = {10.1007/978-3-030-50371-0_40},
  note      = {Funding Acknowledgments: NSF 1741040},
  keywords  = {mine,isi},
}

WORKS

WorkflowHub: Community Framework for Enabling Scientific Workflow Research and Development

Rafael Ferreira da Silva, Loïc Pottier, Tainã Coleman, Ewa Deelman, and Henri Casanova

In 2020 IEEE/ACM Workflows in Support of Large-Scale Science (WORKS) 2020

Bib

@inproceedings{ferreiradasilva2020works,
  author    = {{Ferreira da Silva}, Rafael and Pottier, Lo{\"i}c and Coleman, Tain{\~a} and Deelman, Ewa and Casanova, Henri},
  title     = {WorkflowHub: Community Framework for Enabling Scientific Workflow Research and Development},
  booktitle = {2020 IEEE/ACM Workflows in Support of Large-Scale Science (WORKS)},
  year      = {2020},
  pages     = {49--56},
  keywords  = {mine,workshop,isi},
  doi       = {10.1109/WORKS51914.2020.00012},
  note      = {Funding Acknowledgments: NSF 2016619, DOE DE-SC0012636, NSF 1664162, NSF 1923539},
}

Cluster

Modeling the Performance of Scientific Workflow Executions on HPC Platforms with Burst Buffers

Loïc Pottier, Rafael Ferreira da Silva, Henri Casanova, and Ewa Deelman

In 2020 IEEE International Conference on Cluster Computing (CLUSTER) 2020

Bib

@inproceedings{pottier2020cluster,
  title     = {Modeling the Performance of Scientific Workflow Executions on HPC Platforms with Burst Buffers},
  author    = {Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Casanova, Henri and Deelman, Ewa},
  booktitle = {2020 IEEE International Conference on Cluster Computing (CLUSTER)},
  pages     = {92--103},
  year      = {2020},
  doi       = {10.1109/CLUSTER49012.2020.00019},
  note      = {Funding Acknowledgments: DOE DE-SC0012636, NSF 1664162, NSF 1741040, NSF 1923539, NSF 1923621},
  keywords  = {mine,isi},
}

2019

IJHPCA
Co-scheduling HPC workloads on cache-partitioned CMP platforms

Guillaume Aupy, Anne Benoit, Brice Goglin, Loïc Pottier, and Yves Robert

International Journal of High Performance Computing Applications Apr 2019

Abs Bib HTML

With the recent advent of many-core architectures such as chip multiprocessors (CMPs), the number of processing units accessing a global shared memory is constantly increasing. Co-scheduling techniques are used to improve application throughput on such architectures, but sharing resources often generates critical interferences. In this article, we focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application their own cache area. We consider m iterative HPC applications running concurrently and answer the following questions: (i) How to precisely model the behavior of these applications on the cache-partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency? Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application. Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.
@article{ijhpca2019cmp,
  author   = {Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"i}c and Robert, Yves},
  title    = {Co-scheduling HPC workloads on cache-partitioned CMP platforms},
  journal  = {International Journal of High Performance Computing Applications},
  year     = {2019},
  month    = apr,
  doi      = {10.1177/1094342019846956},
  keywords = {mine,ensl},
}

eScience

Cyberinfrastructure Center of Excellence Pilot: Connecting Large Facilities Cyberinfrastructure

Ewa Deelman, Anirban Mandal, Valerio Pascucci, Susan Sons, Jane Wyngaard, Charles F Vardeman II, and 25 more authors

In 15th International Conference on eScience (eScience) Apr 2019

Bib

@inproceedings{deelman-escience-2019,
  author    = {Deelman, Ewa and Mandal, Anirban and Pascucci, Valerio and Sons, Susan and Wyngaard, Jane and Vardeman, II, Charles F and Petruzza, Steve and Baldin, Ilya and Christopherson, Laura and Mitchell, Ryan and Pottier, Lo{\"i}c and Rynge, Mats and Scott, Erik and Vahi, Karan and Kogank, Marina and Mann, Jasmine A and Gulbransen, Tom and Allen, Daniel and Barlow, David and Bonarrigo, Santiago and Clark, Chris and Goldman, Leslie and Goulden, Tristan and Harvey, Phil and Hulsander, David and Jacob, Steve and Laney, Christine and Lobo-Padilla, Ivan and Sampson, Jeremey and Staarmann, John and Stone, Steve},
  title     = {Cyberinfrastructure Center of Excellence Pilot: Connecting Large Facilities Cyberinfrastructure},
  booktitle = {15th International Conference on eScience (eScience)},
  year      = {2019},
  location  = {San Diego, CA, USA},
  note      = {Funding Acknowledgments: NSF 1842042},
  keywords  = {mine,isi},
}

BigData

Exploration of Workflow Management Systems Emerging Features from Users Perspectives

Ryan Mitchell, Loïc Pottier, Steve Jacobs, Rafael Ferreira da Silva, Mats Rynge, Karan Vahi, and 1 more author

In First International Workshop on Big Data Tools, Methods, and Use Cases for Innovative Scientific Discovery (BTSD) Apr 2019

Bib

@inproceedings{mitchell2019btsd,
  author    = {Mitchell, Ryan and Pottier, Lo{\"i}c and Jacobs, Steve and {Ferreira da Silva}, Rafael and Rynge, Mats and Vahi, Karan and Deelman, Ewa},
  title     = {Exploration of Workflow Management Systems Emerging Features from Users Perspectives},
  booktitle = {First International Workshop on Big Data Tools, Methods, and Use Cases for Innovative Scientific Discovery (BTSD)},
  year      = {2019},
  note      = {Funding Acknowledgments: NSF 1842042},
  keywords  = {mine,workshop,isi},
}

eScience

Characterization of In Situ and In Transit Analytics of Molecular Dynamics Simulations for Next-generation Supercomputers

Stephen Thomas, Michael Wyatt, Tu Mai Anh Do, Loïc Pottier, Rafael Ferreira da Silva, Harel Weinstein, and 4 more authors

In 15th International Conference on eScience (eScience) Apr 2019

Bib

@inproceedings{thomas-escience-2019,
  author    = {Thomas, Stephen and Wyatt, Michael and Do, Tu Mai Anh and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Weinstein, Harel and Cuendet, Michel A. and Estrada, Trilce and Deelman, Ewa and Taufer, Michela},
  title     = {Characterization of In Situ and In Transit Analytics of Molecular Dynamics Simulations for Next-generation Supercomputers},
  booktitle = {15th International Conference on eScience (eScience)},
  pages     = {188--198},
  year      = {2019},
  doi       = {10.1109/eScience.2019.00027},
  keywords  = {mine,isi},
  note      = {Funding Acknowledgments: NSF 1741040},
}

2018

Thesis

Co-scheduling for large-scale applications: memory and resilience

Loïc Pottier

Sep 2018

Bib HTML

@phdthesis{pottier2018,
  author      = {Pottier, Lo{\"i}c},
  title       = {Co-scheduling for large-scale applications: memory and resilience},
  school      = {{Universit{\'e} de Lyon}},
  number      = {2018LYSEN039},
  year        = {2018},
  month       = sep,
  url         = {https://tel.archives-ouvertes.fr/tel-01892395},
  keywords    = {mine,ensl, Co-scheduling algorithm, Memory hierarchy, Cache memory, Scheduling, Resilience, High performance computing, HPC, Memory, Many-core, Ordonnancement concurrent, Hi{\'e}rarchie m{\'e}moire, Algorithme d'ordonnancement, R{\'e}silience, Informatique haute performance, Ant{\'e}m{\'e}moire},
  hal_id      = {tel-01892395},
  hal_version = {v1},
}

IJHPCA
Resilient co-scheduling of malleable applications

Anne Benoit, Loïc Pottier, and Yves Robert

International Journal of High Performance Computing Applications Sep 2018

Abs Bib HTML

Recently, the benefits of co-scheduling several applications have been demonstrated in a fault-free context, both in terms of performance and energy savings. However, large-scale computer systems are confronted by frequent failures, and resilience techniques must be employed for large applications to execute efficiently. Indeed, failures may create severe imbalance between applications and significantly degrade performance. In this article, we aim at minimizing the expected completion time of a set of co-scheduled applications. We propose to redistribute the resources assigned to each application upon the occurrence of failures, and upon the completion of some applications, in order to achieve this goal. First, we introduce a formal model and establish complexity results. The problem is NP-complete for malleable applications, even in a fault-free context. Therefore, we design polynomial-time heuristics that perform redistributions and account for processor failures. A fault simulator is used to perform extensive simulations that demonstrate the usefulness of redistribution and the performance of the proposed heuristics.
@article{ijhpca2018resilience,
  author   = {Benoit, Anne and Pottier, Lo{\"i}c and Robert, Yves},
  title    = {Resilient co-scheduling of malleable applications},
  journal  = {International Journal of High Performance Computing Applications},
  volume   = {32},
  number   = {1},
  pages    = {89--103},
  year     = {2018},
  doi      = {10.1177/1094342017704979},
  keywords = {mine,ensl},
}
IJHPCA
Co-scheduling Amdahl applications on cache-partitioned systems

Guillaume Aupy, Anne Benoit, Sicheng Dai, Loïc Pottier, Padma Raghavan, Yves Robert, and 1 more author

International Journal of High Performance Computing Applications Sep 2018

Abs Bib HTML

Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are as follows: (i) which proportion of cache and (ii) how many processors should be given to each application? In this article, we provide answers to (i) and (ii) for Amdahl applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for Amdahl applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.
@article{ijhpca2018cache,
  author   = {Aupy, Guillaume and Benoit, Anne and Dai, Sicheng and Pottier, Lo{\"i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},
  title    = {Co-scheduling Amdahl applications on cache-partitioned systems},
  journal  = {International Journal of High Performance Computing Applications},
  volume   = {32},
  number   = {1},
  pages    = {123--138},
  year     = {2018},
  doi      = {10.1177/1094342017710806},
  keywords = {mine,ensl},
}
Cluster
Co-Scheduling HPC Workloads on Cache-Partitioned CMP Platforms

Guillaume Aupy, Anne Benoit, Brice Goglin, Loïc Pottier, and Yves Robert

In IEEE International Conference on Cluster Computing, CLUSTER 2018, Belfast, UK, September 10-13, 2018 Sep 2018

Abs Bib HTML

Co-scheduling techniques are used to improve the throughput of applications on chip multiprocessors (CMP), but sharing resources often generates critical interferences. We focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application their own cache area. We consider m iterative HPC applications running concurrently and answer the following questions: (i) how to precisely model the behavior of these applications on the cache partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency? Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application. Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.
@inproceedings{cluster18,
  author    = {Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"i}c and Robert, Yves},
  title     = {Co-Scheduling {HPC} Workloads on Cache-Partitioned {CMP} Platforms},
  booktitle = {{IEEE} International Conference on Cluster Computing, {CLUSTER} 2018, Belfast, UK, September 10-13, 2018},
  publisher = {{IEEE} Computer Society},
  pages     = {348--358},
  year      = {2018},
  doi       = {10.1109/CLUSTER.2018.00052},
  keywords  = {mine,ensl},
}
ICPP
A Performance Model to Execute Workflows on High-Bandwidth-Memory Architectures

Anne Benoit, Swann Perarnau, Loïc Pottier, and Yves Robert

In Proceedings of the 47th International Conference on Parallel Processing, ICPP 2018, Eugene, OR, USA, August 13-16, 2018 Sep 2018

Abs Bib HTML

This work presents a realistic performance model to execute scientific workflows on high-bandwidth-memory architectures such as the Intel Knights Landing. We provide a detailed analysis of the execution time on such platforms, taking into account transfers from both fast and slow memory and their overlap with computations. We discuss several scheduling and mapping strategies: not only tasks must be assigned to computing resources, but also one has to decide which fraction of input and output data will reside in fast memory and which will have to stay in slow memory. We use extensive simulations to assess the impact of the mapping strategies on performance. We also conduct experiments for a simple 1D Gauss-Seidel kernel, which assess the accuracy of the model and further demonstrate the importance of a tuned memory management. Our model and results lay the foundations for further studies and experiments on dual-memory systems.
@inproceedings{icpp2018,
  author    = {Benoit, Anne and Perarnau, Swann and Pottier, Lo{\"i}c and Robert, Yves},
  title     = {A Performance Model to Execute Workflows on High-Bandwidth-Memory Architectures},
  booktitle = {Proceedings of the 47th International Conference on Parallel Processing, {ICPP} 2018, Eugene, OR, USA, August 13-16, 2018},
  publisher = {{ACM}},
  address   = {New York, NY, USA},
  location  = {Eugene, OR, USA},
  articleno = {36},
  pages     = {36:1--36:10},
  year      = {2018},
  isbn      = {978-1-4503-6510-9},
  acmid     = {3225110},
  doi       = {10.1145/3225058.3225110},
  keywords  = {mine,ensl},
}

2017

Book
Co-scheduling high-performance computing applications

Guillaume Aupy, Anne Benoit, Loïc Pottier, Padma Raghavan, Yves Robert, and Manu Shantharam

May 2017

Abs Bib HTML

Big data applications play an increasing role in high-performance computing. They are perfect candidates for co-scheduling, as they obey flexible speedup models, alternating I/O operations and intensive computation phases. In this chapter, we discuss co-scheduling on failure-prone platforms. Checkpointing helps to mitigate the impact of a failure on a given application, but it must be complemented by redistributions to rebalance the load among all applications. Co-scheduling usually involves partitioning the applications into packs, and then scheduling each pack in sequence, as efficiently as possible. The objective is therefore to determine a partition into packs, and an assignment of processors to applications, that minimize the sum of the execution times of the packs. On the theoretical side, we assess the problem complexity. On the practical side, we design several polynomial-time heuristics to deal with the general problem with failures and redistribution costs. The proposed heuristics show very good performance while executing in very short time, hence validating the approach.
@incollection{chapter2016crc,
  author    = {Aupy, Guillaume and Benoit, Anne and Pottier, Lo{\"i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},
  title     = {Co-scheduling high-performance computing applications},
  booktitle = {Big Data Management and Processing},
  editor    = {Li, Kuan-Ching and Jiang, Hai and Zomaya, Albert},
  publisher = {Chapman and Hall/CRC Press},
  chapter   = {5},
  pages     = {81--104},
  month     = may,
  year      = {2017},
  doi       = {10.1201/9781315154008-5},
  keywords  = {mine,ensl},
}
APDCM
Co-Scheduling Algorithms for Cache-Partitioned Systems

Guillaume Aupy, Anne Benoit, Loïc Pottier, Padma Raghavan, Yves Robert, and Manu Shantharam

In 2017 IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPS Workshops 2017, Orlando / Buena Vista, FL, USA, May 29 - June 2, 2017 May 2017

Abs Bib HTML

Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are: (i) which proportion of cache and (ii) how many processors should be given to each application? Here, we assign rational numbers of processors to each application, since they can be shared across applications through multi-threading. In this paper, we provide answers to (i) and (ii) for perfectly parallel applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for general applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.
@inproceedings{apdcm2017,
  author    = {Aupy, Guillaume and Benoit, Anne and Pottier, Lo{\"i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},
  title     = {Co-Scheduling Algorithms for Cache-Partitioned Systems},
  booktitle = {2017 {IEEE} International Parallel and Distributed Processing Symposium Workshops, {IPDPS} Workshops 2017, Orlando / Buena Vista, FL, USA, May 29 - June 2, 2017},
  pages     = {874--883},
  publisher = {{IEEE} Computer Society},
  year      = {2017},
  month     = may,
  location  = {Lake Buena Vista, FL, USA},
  doi       = {10.1109/IPDPSW.2017.60},
  isbn      = {978-1-5386-3408-0},
  keywords  = {mine,workshop,ensl},
}

2016

ICPP
Resilient Application Co-scheduling with Processor Redistribution

Anne Benoit, Loïc Pottier, and Yves Robert

In 45th International Conference on Parallel Processing, ICPP 2016, Philadelphia, PA, USA, August 16-19, 2016 Aug 2016

Abs Bib HTML

Recently, the benefits of co-scheduling several applications have been demonstrated in a fault-free context, both in terms of performance and energy savings. However, large-scale computer systems are confronted by frequent failures, and resilience techniques must be employed to ensure the completion of large applications. Indeed, failures may create severe imbalance between applications, and significantly degrade performance. In this paper, we propose to redistribute the resources assigned to each application upon the striking of failures, in order to minimize the expected completion time of a set of co-scheduled applications. First, we introduce a formal model and establish complexity results. When no redistribution is allowed, we can minimize the expected completion time in polynomial time, while the problem becomes NP-complete with redistributions, even in a fault-free context. Therefore, we design polynomial-time heuristics that perform redistributions and account for processor failures. A fault simulator is used to perform extensive simulations that demonstrate the usefulness of redistribution and the performance of the proposed heuristics.
@inproceedings{icpp2016,
  author    = {Benoit, Anne and Pottier, Lo{\"i}c and Robert, Yves},
  title     = {Resilient Application Co-scheduling with Processor Redistribution},
  booktitle = {45th International Conference on Parallel Processing, {ICPP} 2016, Philadelphia, PA, USA, August 16-19, 2016},
  publisher = {{IEEE} Computer Society},
  location  = {Philadelphia, PA, USA},
  pages     = {123--132},
  month     = aug,
  year      = {2016},
  issn      = {2332-5690},
  isbn      = {978-1-5090-2823-8},
  doi       = {10.1109/ICPP.2016.21},
  keywords  = {mine,ensl},
}