diff --git a/Makefile b/Makefile index c43770ce8d4c..afa834416263 100644 --- a/Makefile +++ b/Makefile @@ -39,14 +39,14 @@ build: pd-server pd-ctl pd-tso-bench pd-recover pd-server: export GO111MODULE=on pd-server: ifeq ("$(WITH_RACE)", "1") - CGO_ENABLED=1 go build -race -ldflags '$(LDFLAGS)' -o bin/pd-server cmd/pd-server/main.go + CGO_ENABLED=1 go build -race -gcflags '$(GCFLAGS)' -ldflags '$(LDFLAGS)' -o bin/pd-server cmd/pd-server/main.go else - CGO_ENABLED=0 go build -ldflags '$(LDFLAGS)' -o bin/pd-server cmd/pd-server/main.go + CGO_ENABLED=0 go build -gcflags '$(GCFLAGS)' -ldflags '$(LDFLAGS)' -o bin/pd-server cmd/pd-server/main.go endif pd-ctl: export GO111MODULE=on pd-ctl: - CGO_ENABLED=0 go build -ldflags '$(LDFLAGS)' -o bin/pd-ctl tools/pd-ctl/main.go + CGO_ENABLED=0 go build -gcflags '$(GCFLAGS)' -ldflags '$(LDFLAGS)' -o bin/pd-ctl tools/pd-ctl/main.go pd-tso-bench: export GO111MODULE=on pd-tso-bench: CGO_ENABLED=0 go build -o bin/pd-tso-bench tools/pd-tso-bench/main.go diff --git a/conf/config.toml b/conf/config.toml index 99ac0065da17..3b3289eb61cd 100644 --- a/conf/config.toml +++ b/conf/config.toml @@ -67,8 +67,7 @@ leader-schedule-limit = 4 region-schedule-limit = 64 replica-schedule-limit = 64 merge-schedule-limit = 8 -#tolerant-size-ratio = 5.0 -# Enable two-way merge, set it to true may help improving merge speed. +#tolerant-size-ratio = 0.0 #enable-one-way-merge = false # customized schedulers, the format is as below diff --git a/docs/api.html b/docs/api.html index 46f0332c653e..8c3df3a9d11d 100644 --- a/docs/api.html +++ b/docs/api.html @@ -166,4 +166,4 @@ .resource-modal li > ul { margin-bottom: 1em; } -

/cluster/status

Cluster status.

get

Get cluster status.

/version

The version of PD server.

get

Get the version of PD server.

/status

The build info of PD server.

get

Get the build info of PD server.

/diagnose

Diagnostic information of the cluster.

get

/members

The PD servers in the cluster.

get

List all PD servers in the cluster.

A specific PD server.

delete

Remove a PD server from the cluster.

post

Set leader priority of a PD member.

A specific PD server.

delete

Remove a PD server from the cluster.

/leader

The leader PD server of the cluster.

get

Get the leader PD server of the cluster.

post

Transfer leadership to another PD server.

post

Transfer leadership to the specific PD server.

/health

Health status of PD servers.

get

/config

PD cluster configuration.

get

Get full config.

post

Update a config item.

Schedule configuration.

get

Get schedule config.

post

Update a schedule config item.

Replication configuration.

get

Get replication config.

post

Update a replication config item.

The config of a namespace.

get

Get configuration of a namespace.

post

Update a namespace config item.

delete

Delete a namespace config.

The label property configuration.

get

Get label property config.

post

Update label property config item.

/stores

The stores in the cluster.

get

Get stores in the cluster.

/store/{storeId}

A specific store.

get

Get a store's information.

delete

Take down a store from the cluster.

The specific store's state.

post

Set the store's state.

The specific store's label.

post

Set the store's label.

The specific store's weight.

post

Set the store's leader/region weight.

/labels

The store label values in the cluster.

get

List all label values.

get

List stores that have specific label values.

/region

A specific region in the cluster.

get

Search for a region by region ID.

get

Search for a region by a key.

/regions

The regions in the cluster.

get

List all regions in the cluster.

get

List regions with the highest write flow.

get

List regions with the highest read flow.

get

List regions with the largest conf version.

get

List regions with the largest version.

get

List regions with the largest size.

get

List regions start from a key.

get

List regions with unhealthy status.

get

List sibling regions of a specific region.

get

List all regions of a specific store.

/schedulers

Running schedulers.

get

List running schedulers.

post

Create a scheduler.

A specific scheduler.

delete

Delete a scheduler.

/operators

Pending operators.

get

List pending operators.

post

Create an operator.

A specific Region's pending operator.

get

Get a Region's pending operator.

delete

Cancel a Region's pending operator.

/hotspot

The hot spots status in the cluster.

get

List the hot write regions.

get

List the hot read regions.

get

List the hot stores.

/stats

Statistics of the cluster.

get

Get region statistics of a specified range.

/trend

Trend of data growth and movements.

get

Get the growth and changes of data in the most recent period of time.

/admin

delete

Drop a specific region from cache.

The log level of PD server.

post

Set log level.

/classifier

The namespace classifier. Methods depend on current classifier.

\ No newline at end of file +

/cluster/status

Cluster status.

get

Get cluster status.

/version

The version of PD server.

get

Get the version of PD server.

/status

The build info of PD server.

get

Get the build info of PD server.

/diagnose

Diagnostic information of the cluster.

get

/members

The PD servers in the cluster.

get

List all PD servers in the cluster.

A specific PD server.

delete

Remove a PD server from the cluster.

post

Set leader priority of a PD member.

A specific PD server.

delete

Remove a PD server from the cluster.

/leader

The leader PD server of the cluster.

get

Get the leader PD server of the cluster.

post

Transfer leadership to another PD server.

post

Transfer leadership to the specific PD server.

/health

Health status of PD servers.

get

/config

PD cluster configuration.

get

Get full config.

post

Update a config item.

Schedule configuration.

get

Get schedule config.

post

Update a schedule config item.

Replication configuration.

get

Get replication config.

post

Update a replication config item.

The config of a namespace.

get

Get configuration of a namespace.

post

Update a namespace config item.

delete

Delete a namespace config.

The label property configuration.

get

Get label property config.

post

Update label property config item.

/stores

The stores in the cluster.

get

Get stores in the cluster.

The balance rate limit for all stores.

get

Get all stores' balance rate limit.

post

Set all stores' balance rate limit.

Remove all tombstone stores.

delete

Remove all tombstone stores.

/store/{storeId}

A specific store.

get

Get a store's information.

delete

Take down a store from the cluster.

The state for the specific store.

post

Set the store's state.

The label for the specific store.

post

Set the store's label.

The weight for the specific store.

post

Set the store's leader/region weight.

The balance rate limit for the specific store.

post

Set the store's balance rate limit.

/labels

The store label values in the cluster.

get

List all label values.

get

List stores that have specific label values.

/region

A specific region in the cluster.

get

Search for a region by region ID.

get

Search for a region by a key.

/regions

The regions in the cluster.

get

List all regions in the cluster.

get

List regions with the highest write flow.

get

List regions with the highest read flow.

get

List regions with the largest conf version.

get

List regions with the largest version.

get

List regions with the largest size.

get

List regions starting from a key.

get

List regions with unhealthy status.

get

List sibling regions of a specific region.

get

List all regions of a specific store.

/schedulers

Running schedulers.

get

List running schedulers.

post

Create a scheduler.

A specific scheduler.

delete

Delete a scheduler.

/operators

Pending operators.

get

List pending operators.

post

Create an operator.

A specific Region's pending operator.

get

Get a Region's pending operator.

delete

Cancel a Region's pending operator.

/hotspot

The hot spots status in the cluster.

get

List the hot write regions.

get

List the hot read regions.

get

List the hot stores.

/stats

Statistics of the cluster.

get

Get region statistics of a specified range.

/trend

Trend of data growth and movements.

get

Get the growth and changes of data in the most recent period of time.

/admin

delete

Drop a specific region from cache.

The log level of PD server.

post

Set log level.

/classifier

The namespace classifier. Methods depend on the current classifier.

\ No newline at end of file diff --git a/server/api/api.raml b/server/api/api.raml index 6612ce9f68dd..e7ac091b6743 100644 --- a/server/api/api.raml +++ b/server/api/api.raml @@ -71,9 +71,13 @@ types: region-schedule-limit?: integer replica-schedule-limit?: integer merge-schedule-limit?: integer + hot-region-schedule-limit?: integer + hot-region-cache-hits-threshold?: integer + store-balance-rate?: number tolerant-size-ratio?: number low-space-ratio?: number high-space-ratio?: number + scheduler-max-waiting-operator?: integer disable-raft-learner?: boolean disable-remove-down-replica?: boolean disable-replace-offline-replica?: boolean @@ -663,6 +667,41 @@ types: 500: description: PD server failed to proceed the request. + /limit: + description: The balance rate limit for all stores. + get: + description: Get all stores' balance rate limit. + responses: + 200: + body: + application/json: + type: string + 500: + description: PD server failed to proceed the request. + post: + description: Set all stores' balance rate limit. + body: + application/json: + description: key-value pair. + type: object + responses: + 200: + description: All stores' balance rate limits are updated. + 400: + description: The input is invalid. + 500: + description: PD server failed to proceed the request. + + /remove-tombstone: + description: Remove all tombstone stores. + delete: + description: Remove all tombstone stores. + responses: + 200: + description: All tombstone stores are removed. + 500: + description: PD server failed to proceed the request. + /store/{storeId}: description: A specific store. uriParameters: @@ -696,7 +735,7 @@ types: description: PD server failed to proceed the request. /state: - description: The specific store's state. + description: The state for the specific store. post: description: Set the store's state. queryParameters: @@ -714,7 +753,7 @@ types: description: PD server failed to proceed the request. /label: - description: The specific store's label. 
+ description: The label for the specific store. post: description: Set the store's label. body: @@ -730,7 +769,7 @@ types: description: PD server failed to proceed the request. /weight: - description: The specific store's weight. + description: The weight for the specific store. post: description: Set the store's leader/region weight. body: @@ -746,6 +785,22 @@ types: 500: description: PD server failed to proceed the request. + /limit: + description: The balance rate limit for the specific store. + post: + description: Set the store's balance rate limit. + body: + application/json: + description: key-value pair. + type: object + responses: + 200: + description: The store's balance rate limit is updated. + 400: + description: The input is invalid. + 500: + description: PD server failed to proceed the request. + /labels: description: The store label values in the cluster. get: diff --git a/server/schedule/selector.go b/server/schedule/selector.go index 721ec425f222..0530f106c13a 100644 --- a/server/schedule/selector.go +++ b/server/schedule/selector.go @@ -34,7 +34,7 @@ func NewBalanceSelector(kind core.ResourceKind, filters []Filter) *BalanceSelect } } -// SelectSource selects the store that can pass all filters and has the minimal +// SelectSource selects the store that can pass all filters and has the maximal // resource score. func (s *BalanceSelector) SelectSource(opt Options, stores []*core.StoreInfo) *core.StoreInfo { var result *core.StoreInfo @@ -51,7 +51,7 @@ func (s *BalanceSelector) SelectSource(opt Options, stores []*core.StoreInfo) *c return result } -// SelectTarget selects the store that can pass all filters and has the maximal +// SelectTarget selects the store that can pass all filters and has the minimal // resource score. func (s *BalanceSelector) SelectTarget(opt Options, stores []*core.StoreInfo, filters ...Filter) *core.StoreInfo { filters = append(filters, s.filters...) 
diff --git a/server/schedulers/balance_region.go b/server/schedulers/balance_region.go index aa53feb37e95..cc9057911ad2 100644 --- a/server/schedulers/balance_region.go +++ b/server/schedulers/balance_region.go @@ -207,12 +207,29 @@ func (s *balanceRegionScheduler) hasPotentialTarget(cluster schedule.Cluster, re for _, store := range cluster.GetStores() { if schedule.FilterTarget(cluster, store, filters) { + log.Debug("skip target store by filters", + zap.String("scheduler", s.GetName()), + zap.Uint64("region", region.GetID()), + zap.Uint64("source", source.GetID()), + zap.Uint64("target", store.GetID())) continue } if !store.IsUp() || store.DownTime() > cluster.GetMaxStoreDownTime() { + log.Debug("skip target store by status", + zap.String("scheduler", s.GetName()), + zap.Uint64("region", region.GetID()), + zap.Uint64("source", source.GetID()), + zap.Uint64("target", store.GetID()), + zap.Bool("isup", store.IsUp()), + zap.Duration("downtime", store.DownTime())) continue } if !shouldBalance(cluster, source, store, region, core.RegionKind, opInfluence) { + log.Debug("skip target store for it should not balance", + zap.String("scheduler", s.GetName()), + zap.Uint64("region", region.GetID()), + zap.Uint64("source", source.GetID()), + zap.Uint64("target", store.GetID())) continue } return true diff --git a/tools/pd-ctl/README.md b/tools/pd-ctl/README.md index 255e608109bd..8612dddbfa08 100644 --- a/tools/pd-ctl/README.md +++ b/tools/pd-ctl/README.md @@ -120,8 +120,9 @@ Usage: "max-store-down-time": "30m0s", "merge-schedule-limit": 8, "patrol-region-interval": "100ms", - "region-schedule-limit": 4, - "replica-schedule-limit": 8, + "region-schedule-limit": 64, + "replica-schedule-limit": 64, + "scheduler-max-waiting-operator": 3, "schedulers-v2": [ { "args": null, @@ -146,7 +147,7 @@ Usage: ], "split-merge-interval": "1h0m0s", "store-balance-rate": 1, - "tolerant-size-ratio": 5 + "tolerant-size-ratio": 0 } } >> config show all // Display all config information @@ -169,7 
+170,7 @@ Usage: - `max-snapshot-count` controls the maximum number of snapshots that a single store receives or sends out at the same time. The scheduler is restricted by this configuration to avoid taking up normal application resources. When you need to improve the speed of adding replicas or balancing, increase this value. ```bash - >> config set max-snapshort-count 16 // Set the maximum number of snapshots to 16 + >> config set max-snapshot-count 16 // Set the maximum number of snapshots to 16 ``` - `max-pending-peer-count` controls the maximum number of pending peers in a single store. The scheduler is restricted by this configuration to avoid producing a large number of Regions without the latest log in some nodes. When you need to improve the speed of adding replicas or balancing, increase this value. Setting it to 0 indicates no limit.