Commit 08d7cfe

Assign reducer ops in task assigner to make them more balanced across cluster (#3048)

Authored by Xuye (Chris) Qin on May 18, 2022
1 parent: 7840183

Showing 6 changed files with 104 additions and 21 deletions.
benchmarks/tpch/run_queries.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -617,7 +617,7 @@ def g2(x):
 def q13(customer, orders):
     customer_filtered = customer.loc[:, ["C_CUSTKEY"]]
     orders_filtered = orders[
-        ~orders["O_COMMENT"].str.contains("special(\S|\s)*requests")
+        ~orders["O_COMMENT"].str.contains("special[\S|\s]*requests")
     ]
     orders_filtered = orders_filtered.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
     c_o_merged = customer_filtered.merge(
```
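Note on the regex change: both patterns match the same strings — `\S` and `\s` together cover every character, and the `|` inside the brackets is merely a redundant literal — but the bracketed form is a character class rather than a capture group, so pandas no longer emits its "match groups" warning for `str.contains`. A quick standalone check (illustrative, not part of the commit):

```python
import warnings

import pandas as pd

s = pd.Series(["please handle special shipping requests", "nothing here"])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    grouped = s.str.contains(r"special(\S|\s)*requests")  # old pattern
    classed = s.str.contains(r"special[\S|\s]*requests")  # new pattern

# Identical match results...
assert grouped.equals(classed)
# ...but only the capture-group version triggers pandas' warning.
assert any("match groups" in str(w.message) for w in caught)
```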
mars/core/operand/shuffle.py (4 changes: 3 additions & 1 deletion)

```diff
@@ -27,8 +27,10 @@ class ShuffleProxy(VirtualOperand):
 
 
 class MapReduceOperand(Operand):
-    reducer_index = TupleField("reducer_index", FieldTypes.uint64)
+    # for mapper
     mapper_id = Int32Field("mapper_id", default=0)
+    # for reducer
+    reducer_index = TupleField("reducer_index", FieldTypes.uint64)
+    reducer_phase = StringField("reducer_phase", default=None)
 
     def _new_chunks(self, inputs, kws=None, **kw):
```
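The regrouped fields read as a small schema: `mapper_id` tags the map side, while `reducer_index` (the output partition a reducer produces) and the newly added `reducer_phase` describe the reduce side. A toy hash shuffle sketching the role of a reducer index (plain Python, no Mars types; the bucketing scheme is made up for illustration):

```python
# Each mapper (tagged by mapper_id) buckets its rows by key; the reducer
# whose reducer_index matches a bucket consumes exactly that bucket.
n_reducers = 2
mapper_outputs = []
for mapper_id, rows in enumerate([[("a", 1), ("b", 2)], [("a", 3)]]):
    buckets = {(i,): [] for i in range(n_reducers)}
    for key, value in rows:
        buckets[(hash(key) % n_reducers,)].append((key, value))
    mapper_outputs.append(buckets)

reducer_index = (0,)
consumed = [kv for out in mapper_outputs for kv in out[reducer_index]]
# `consumed` now holds every record routed to reducer (0,), from all mappers.
```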
mars/dataframe/groupby/aggregation.py (3 changes: 3 additions & 0 deletions)

```diff
@@ -806,9 +806,12 @@ def _choose_tree_method(
         len(ctx.get_worker_addresses()) > 1
         and estimate_size > chunk_store_limit
         and np.mean(agg_sizes) > 1024**2
+        and total_count <= 256
     ):
         # For distributed execution: if the estimated size is potentially
         # large, each chunk is large enough (> 1M; small chunks mean a
+        # large estimation error), and the total count is relatively small
+        # (<= 256; shuffling a large number of chunks is not efficient),
         # we choose to use shuffle.
         return False
     # calculate the coefficient of variation of aggregation sizes,
```
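Restated as standalone code, the gate looks like this; the parameter names mirror the snippet above, but the function itself is an illustrative sketch (the real `_choose_tree_method` goes on to check the coefficient of variation of aggregation sizes, which the diff truncates):

```python
import numpy as np

def prefer_shuffle(n_workers, estimate_size, chunk_store_limit,
                   agg_sizes, total_count) -> bool:
    """Sketch: True means shuffle should be used instead of tree aggregation."""
    return (
        n_workers > 1                          # distributed execution
        and estimate_size > chunk_store_limit  # result potentially large
        and np.mean(agg_sizes) > 1024**2       # chunks > 1M, estimates reliable
        and total_count <= 256                 # few enough chunks to shuffle
    )

# e.g. 4 workers, 512 MB estimate vs. a 128 MB limit, 64 chunks of 2 MB each
assert prefer_shuffle(4, 512 * 1024**2, 128 * 1024**2, [2 * 1024**2] * 64, 64)
```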
mars/services/task/analyzer/analyzer.py (26 changes: 23 additions & 3 deletions)

```diff
@@ -19,9 +19,15 @@
 
 from ....config import Config
 from ....core import ChunkGraph, ChunkType, enter_mode
-from ....core.operand import Fetch, VirtualOperand, LogicKeyGenerator
+from ....core.operand import (
+    Fetch,
+    VirtualOperand,
+    LogicKeyGenerator,
+    MapReduceOperand,
+    OperandStage,
+)
 from ....resource import Resource
-from ....typing import BandType
+from ....typing import BandType, OperandType
 from ....utils import build_fetch, tokenize
 from ...subtask import SubtaskGraph, Subtask
 from ..core import Task, new_task_id
@@ -31,6 +37,18 @@
 logger = logging.getLogger(__name__)
 
 
+def need_reassign_worker(op: OperandType) -> bool:
+    # NOTE(qinxuye): special processing for reducers.
+    # Ideally every reducer op would have its stage set to reduce;
+    # however, in many cases we copy a reducer op from a tileable op
+    # and set its stage to reduce afterwards, so it would be quite
+    # nasty to take over __setattr__ there just to flip
+    # reassign_worker to True, etc.
+    return op.reassign_worker or (
+        isinstance(op, MapReduceOperand) and op.stage == OperandStage.reduce
+    )
+
+
 class GraphAnalyzer:
     def __init__(
         self,
@@ -294,8 +312,10 @@ def gen_subtask_graph(
         subtask_graph: SubtaskGraph
             Subtask graph.
         """
+        # Reassign the worker when reassign_worker=True was specified
+        # or when the op is a reducer operand.
         reassign_worker_ops = [
-            chunk.op for chunk in self._chunk_graph if chunk.op.reassign_worker
+            chunk.op for chunk in self._chunk_graph if need_reassign_worker(chunk.op)
         ]
         start_ops = (
            list(self._iter_start_ops(self._chunk_graph))
```
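A quick way to see what the new predicate accepts, using mock stand-ins rather than real operands (illustrative, not part of the commit):

```python
from unittest.mock import Mock

from mars.core.operand import MapReduceOperand, Operand, OperandStage
from mars.services.task.analyzer.analyzer import need_reassign_worker

# An ordinary operand that never asked for reassignment stays put.
plain = Mock(spec=Operand, reassign_worker=False)
assert not need_reassign_worker(plain)

# A reduce-stage MapReduceOperand is picked up even though reassign_worker
# was never set, which is exactly the point of the helper.
reducer = Mock(
    spec=MapReduceOperand, reassign_worker=False, stage=OperandStage.reduce
)
assert need_reassign_worker(reducer)
```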
mars/services/task/analyzer/assigner.py (53 changes: 38 additions & 15 deletions)

```diff
@@ -15,12 +15,13 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from operator import itemgetter
-from typing import List, Dict, Set, Union
+from typing import List, Dict, Union
 
 import numpy as np
 
 from ....core import ChunkGraph, ChunkData
 from ....core.operand import Operand
+from ....lib.ordered_set import OrderedSet
 from ....resource import Resource
 from ....typing import BandType
 from ....utils import implements
@@ -77,8 +78,9 @@ def __init__(
         band_resource: Dict[BandType, Resource],
     ):
         super().__init__(chunk_graph, start_ops, band_resource)
-        self._undirected_chunk_graph = None
-        self._op_keys: Set[str] = {start_op.key for start_op in start_ops}
+        self._op_keys: OrderedSet[str] = OrderedSet(
+            [start_op.key for start_op in start_ops]
+        )
 
     def _calc_band_assign_limits(
         self, initial_count: int, occupied: Dict[BandType, int]
@@ -124,13 +126,15 @@ def _calc_band_assign_limits(
             pos = (pos + 1) % len(counts)
         return dict(zip(bands, counts))
 
+    @classmethod
     def _assign_by_bfs(
-        self,
+        cls,
+        undirected_chunk_graph: ChunkGraph,
         start: ChunkData,
         band: BandType,
         initial_sizes: Dict[BandType, int],
         spread_limits: Dict[BandType, float],
-        key_to_assign: Set[str],
+        key_to_assign: OrderedSet[str],
         assigned_record: Dict[str, Union[str, BandType]],
     ):
         """
@@ -140,19 +144,15 @@ def _assign_by_bfs(
         if initial_sizes[band] <= 0:
             return
 
-        graph = self._chunk_graph
-        if self._undirected_chunk_graph is None:
-            self._undirected_chunk_graph = graph.build_undirected()
-        undirected_chunk_graph = self._undirected_chunk_graph
-
         assigned = 0
         spread_range = 0
         for chunk in undirected_chunk_graph.bfs(start=start, visit_predicate="all"):
             op_key = chunk.op.key
             if op_key in assigned_record:
                 continue
             spread_range += 1
-            # `op_key` may not be in `key_to_assign`, but we need to record it to avoid iterate the node repeatedly.
+            # `op_key` may not be in `key_to_assign`, but we need to record
+            # it to avoid iterating over the node repeatedly.
             assigned_record[op_key] = band
             if op_key not in key_to_assign:
                 continue
@@ -161,8 +161,22 @@ def _assign_by_bfs(
                 break
         initial_sizes[band] -= assigned
 
+    def _build_undirected_chunk_graph(
+        self, chunk_to_assign: List[ChunkData]
+    ) -> ChunkGraph:
+        chunk_graph = self._chunk_graph.copy()
+        # Remove the incoming edges of every chunk in chunk_to_assign,
+        # which may contain chunks that need to be reassigned.
+        for chunk in chunk_to_assign:
+            if chunk_graph.count_predecessors(chunk) > 0:
+                for pred in list(chunk_graph.predecessors(chunk)):
+                    chunk_graph.remove_edge(pred, chunk)
+        return chunk_graph.build_undirected()
+
     @implements(AbstractGraphAssigner.assign)
-    def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType]:
+    def assign(
+        self, cur_assigns: Dict[str, BandType] = None
+    ) -> Dict[ChunkData, BandType]:
         graph = self._chunk_graph
         assign_result = dict()
         cur_assigns = cur_assigns or dict()
@@ -173,7 +187,7 @@ def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType]
         for chunk in graph:
             op_key_to_chunks[chunk.op.key].append(chunk)
 
-        op_keys = set(self._op_keys)
+        op_keys = OrderedSet(self._op_keys)
         chunk_to_assign = [
             op_key_to_chunks[op_key][0]
             for op_key in op_keys
@@ -183,6 +197,9 @@ def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType]
         for band in cur_assigns.values():
             assigned_counts[band] += 1
 
+        # build the undirected graph
+        undirected_chunk_graph = self._build_undirected_chunk_graph(chunk_to_assign)
+
         # calculate the number of chunks to be assigned to each band
         # given number of bands and existing assignments
         band_quotas = self._calc_band_assign_limits(
@@ -195,14 +212,20 @@ def assign(self, cur_assigns: Dict[str, str] = None) -> Dict[ChunkData, BandType]
         spread_ranges = defaultdict(lambda: average_spread_range)
         # assign from other chunks to be assigned
         # TODO: sort by what?
-        sorted_candidates = [v for v in chunk_to_assign]
+        sorted_candidates = chunk_to_assign.copy()
         while max(band_quotas.values()):
             band = max(band_quotas, key=lambda k: band_quotas[k])
             cur = sorted_candidates.pop()
             while cur.op.key in cur_assigns:
                 cur = sorted_candidates.pop()
             self._assign_by_bfs(
-                cur, band, band_quotas, spread_ranges, op_keys, cur_assigns
+                undirected_chunk_graph,
+                cur,
+                band,
+                band_quotas,
+                spread_ranges,
+                op_keys,
+                cur_assigns,
             )
 
         key_to_assign = {n.op.key for n in chunk_to_assign} | initial_assigned_op_keys
```
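The crux of `_build_undirected_chunk_graph` is that cutting a reducer's incoming edges turns it into an isolated node in the undirected view, so `_assign_by_bfs` can spread reducers across bands independently of where their mappers landed. A tiny self-contained illustration (plain dicts standing in for Mars graph types):

```python
from collections import defaultdict

def build_undirected(edges):
    # toy stand-in for ChunkGraph.build_undirected()
    adj = defaultdict(set)
    for u, v in edges:
        adj[u].add(v)
        adj[v].add(u)
    return adj

edges = [("mapper_0", "reducer_0"), ("mapper_1", "reducer_0")]

# Without the cut, BFS from reducer_0 reaches its mappers, so the reducer
# tends to be pulled onto a band that already holds mapper chunks.
assert build_undirected(edges)["reducer_0"] == {"mapper_0", "mapper_1"}

# After removing the reducer's incoming edges (what
# _build_undirected_chunk_graph does for every chunk to assign),
# the reducer is isolated and can be placed on any band for balance.
cut = [(u, v) for u, v in edges if v != "reducer_0"]
assert build_undirected(cut)["reducer_0"] == set()
```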
mars/services/task/analyzer/tests/test_assigner.py (37 changes: 36 additions & 1 deletion)

```diff
@@ -13,15 +13,19 @@
 # limitations under the License.
 
 import numpy as np
+import pandas as pd
 
+from ..... import dataframe as md
+from .....config import Config
 from .....core import ChunkGraph
 from .....core.graph.builder.utils import build_graph
+from .....core.operand import OperandStage
 from .....tensor.random import TensorRand
 from .....tensor.arithmetic import TensorAdd
 from .....tensor.fetch import TensorFetch
 from .....resource import Resource
 from ...core import Task
-from ..analyzer import GraphAnalyzer
+from ..analyzer import GraphAnalyzer, need_reassign_worker
 from ..assigner import GraphAssigner
@@ -71,3 +75,34 @@ def test_assigner_with_fetch_inputs():
     for inp in input_chunks:
         if not isinstance(inp.op, TensorFetch):
             assert subtask.expect_band == key_to_assign[inp.key]
+
+
+def test_shuffle_assign():
+    band_num = 8
+    all_bands = [(f"address_{i}", "numa-0") for i in range(band_num)]
+
+    pdf = pd.DataFrame(np.random.rand(32, 4))
+    df = md.DataFrame(pdf, chunk_size=4)
+    r = df.groupby(0).sum(method="shuffle")
+    chunk_graph = build_graph([r], tile=True)
+
+    band_resource = dict((band, Resource(num_cpus=1)) for band in all_bands)
+
+    reassign_worker_ops = [
+        chunk.op for chunk in chunk_graph if need_reassign_worker(chunk.op)
+    ]
+    start_ops = list(GraphAnalyzer._iter_start_ops(chunk_graph))
+    to_assign_ops = start_ops + reassign_worker_ops
+
+    assigner = GraphAssigner(chunk_graph, to_assign_ops, band_resource)
+    assigns = assigner.assign()
+    assert len(assigns) == 16
+    init_assigns = set()
+    reducer_assigns = set()
+    for chunk, assign in assigns.items():
+        if chunk.op.stage == OperandStage.reduce:
+            reducer_assigns.add(assign)
+        else:
+            init_assigns.add(assign)
+    # init chunks and reducers are each assigned across all bands
+    assert len(init_assigns) == len(reducer_assigns) == 8
```
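A back-of-envelope check of the counts the test expects, assuming one input chunk per 4 rows and one reducer per input chunk (which is what the assertions imply):

```python
n_rows, chunk_size, band_num = 32, 4, 8
n_input_chunks = n_rows // chunk_size  # 8 mapper-side start ops
n_reducers = n_input_chunks            # the shuffle keeps the chunk count here
assert n_input_chunks + n_reducers == 16         # len(assigns)
assert n_input_chunks == n_reducers == band_num  # one per band when balanced
```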
