Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flaky test: Test/ClientCloseReturnsEarlyWhenGoAwayWriteHangs #7606

Closed
arjan-bal opened this issue Sep 10, 2024 · 5 comments · Fixed by #7662
Closed

Flaky test: Test/ClientCloseReturnsEarlyWhenGoAwayWriteHangs #7606

arjan-bal opened this issue Sep 10, 2024 · 5 comments · Fixed by #7662
Assignees
Labels
Area: Transport Includes HTTP/2 client/server and HTTP server handler transports and advanced transport features. Type: Bug Type: Testing

Comments

@arjan-bal
Copy link
Contributor

Sample failure: https://github.com/grpc/grpc-go/actions/runs/10780783629/job/29897278416?pr=7498

Logs

panic: test timed out after 7m0s
running tests:
	Test (6m40s)
	Test/ClientCloseReturnsEarlyWhenGoAwayWriteHangs (6m38s)

goroutine 2385 [running]:
testing.(*M).startAlarm.func1()
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2366 +0x265
created by time.goFunc
	/opt/hostedtoolcache/go/1.22.6/x64/src/time/sleep.go:177 +0x45

goroutine 1 [chan receive, 6 minutes]:
testing.(*T).Run(0xc000162340, {0xc548bb, 0x4}, 0xc929a0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1750 +0x851
testing.runTests.func1(0xc000162340)
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2161 +0x86
testing.tRunner(0xc000162340, 0xc000e61b10)
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1689 +0x21f
testing.runTests(0xc00019e228, {0x10ea020, 0x4, 0x4}, {0xc00026dbb8?, 0xc00026dc00?, 0x10f3200?})
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2159 +0x8bf
testing.(*M).Run(0xc0001a4fa0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:2027 +0xf18
main.main()
	_testmain.go:57 +0x2be

goroutine 2285 [chan receive, 6 minutes]:
testing.(*T).Run(0xc0001624e0, {0xb8ebc2, 0x2b}, 0xc0000444a0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1750 +0x851
google.golang.org/grpc/internal/grpctest.RunSubTests(0xc0001624e0, {0xd4aba0, 0x1154000})
	/home/runner/work/grpc-go/grpc-go/internal/grpctest/grpctest.go:114 +0x3d8
google.golang.org/grpc/internal/transport.Test(0xc0001624e0)
	/home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:60 +0x35
testing.tRunner(0xc0001624e0, 0xc929a0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1689 +0x21f
created by testing.(*T).Run in goroutine 1
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1742 +0x826

goroutine 2367 [sync.Mutex.Lock, 6 minutes]:
sync.runtime_SemacquireMutex(0x72ca2d?, 0x0?, 0xc00020aa00?)
	/opt/hostedtoolcache/go/1.22.6/x64/src/runtime/sema.go:77 +0x25
sync.(*Mutex).lockSlow(0xc000202870)
	/opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:171 +0x213
sync.(*Mutex).Lock(0xc000202870)
	/opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:90 +0x55
google.golang.org/grpc/internal/transport.(*http2Client).GetGoAwayReason(0xc0002026c8)
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:1418 +0x55
google.golang.org/grpc/internal/transport.(*http2Client).Close(0xc0002026c8, {0xd49120, 0xc00003a9f0})
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:1031 +0x7b2
google.golang.org/grpc/internal/transport.s.TestClientCloseReturnsEarlyWhenGoAwayWriteHangs({{}}, 0xc0001e0680)
	/home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:2846 +0x7e7
google.golang.org/grpc/internal/grpctest.RunSubTests.func1(0xc0001e0680)
	/home/runner/work/grpc-go/grpc-go/internal/grpctest/grpctest.go:122 +0x10e
testing.tRunner(0xc0001e0680, 0xc0000444a0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1689 +0x21f
created by testing.(*T).Run in goroutine 2285
	/opt/hostedtoolcache/go/1.22.6/x64/src/testing/testing.go:1742 +0x826

goroutine 2368 [IO wait, 6 minutes]:
internal/poll.runtime_pollWait(0x7fe83f328c80, 0x72)
	/opt/hostedtoolcache/go/1.22.6/x64/src/runtime/netpoll.go:345 +0x85
internal/poll.(*pollDesc).wait(0xc00020a120, 0x72, 0x0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/internal/poll/fd_poll_runtime.go:84 +0xb1
internal/poll.(*pollDesc).waitRead(...)
	/opt/hostedtoolcache/go/1.22.6/x64/src/internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0xc00020a100)
	/opt/hostedtoolcache/go/1.22.6/x64/src/internal/poll/fd_unix.go:611 +0x507
net.(*netFD).accept(0xc00020a100)
	/opt/hostedtoolcache/go/1.22.6/x64/src/net/fd_unix.go:172 +0x45
net.(*TCPListener).accept(0xc0000446c0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/net/tcpsock_posix.go:159 +0x3e
net.(*TCPListener).Accept(0xc0000446c0)
	/opt/hostedtoolcache/go/1.22.6/x64/src/net/tcpsock.go:327 +0x65
google.golang.org/grpc/internal/transport.(*server).start(0xc0000242d0, 0xc0001e0680, 0x0, 0xc000000[180](https://github.com/grpc/grpc-go/actions/runs/10780783629/job/29897278416?pr=7498#step:8:181), 0x0)
	/home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:362 +0x522
created by google.golang.org/grpc/internal/transport.setUpServerOnly in goroutine 2367
	/home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:453 +0x325

goroutine 2392 [sync.Mutex.Lock, 6 minutes]:
sync.runtime_SemacquireMutex(0x4b6229?, 0x78?, 0xb66c05?)
	/opt/hostedtoolcache/go/1.22.6/x64/src/runtime/sema.go:77 +0x25
sync.(*Mutex).lockSlow(0xc000202870)
	/opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:171 +0x213
sync.(*Mutex).Lock(0xc000202870)
	/opt/hostedtoolcache/go/1.22.6/x64/src/sync/mutex.go:90 +0x55
google.golang.org/grpc/internal/transport.(*http2Client).Close(0xc0002026c8, {0xd491c0, 0xc0000b8de0})
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:989 +0xe5
google.golang.org/grpc/internal/transport.(*http2Client).reader(0xc0002026c8, 0xc000408f00)
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:1672 +0xcf5
created by google.golang.org/grpc/internal/transport.newHTTP2Client in goroutine 2367
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:412 +0x33ab

goroutine 2393 [chan receive, 6 minutes]:
google.golang.org/grpc/internal/transport.(*hangingConn).Write(0xc000044780, {0xc000124040, 0x2a, 0x40})
	/home/runner/work/grpc-go/grpc-go/internal/transport/transport_test.go:2796 +0xb5
google.golang.org/grpc/internal/transport.(*bufWriter).Write(0xc000024aa0, {0xc000124040, 0x2a, 0x40})
	/home/runner/work/grpc-go/grpc-go/internal/transport/http_util.go:325 +0x1da
golang.org/x/net/http2.(*Framer).endWrite(0xc00015e540)
	/home/runner/go/pkg/mod/golang.org/x/net@v0.28.0/http2/frame.go:371 +0x19d
golang.org/x/net/http2.(*Framer).WriteGoAway(0xc00015e540, 0x1, 0x0, {0xc0001d4380, 0x19, 0x47928e?})
	/home/runner/go/pkg/mod/golang.org/x/net@v0.28.0/http2/frame.go:924 +0x599
google.golang.org/grpc/internal/transport.(*http2Client).outgoingGoAwayHandler(0xc000[202](https://github.com/grpc/grpc-go/actions/runs/10780783629/job/29897278416?pr=7498#step:8:203)6c8, 0xc00017e440)
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:531 +0x177
google.golang.org/grpc/internal/transport.(*loopyWriter).goAwayHandler(0xc00024c180, 0xc00017e440)
	/home/runner/work/grpc-go/grpc-go/internal/transport/controlbuf.go:852 +0x67
google.golang.org/grpc/internal/transport.(*loopyWriter).handle(0xc00024c180, {0xbbf300, 0xc00017e440})
	/home/runner/work/grpc-go/grpc-go/internal/transport/controlbuf.go:886 +0x7dc
google.golang.org/grpc/internal/transport.(*loopyWriter).run(0xc00024c180)
	/home/runner/work/grpc-go/grpc-go/internal/transport/controlbuf.go:579 +0x17c
google.golang.org/grpc/internal/transport.newHTTP2Client.func6()
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:470 +0x250
created by google.golang.org/grpc/internal/transport.newHTTP2Client in goroutine 2367
	/home/runner/work/grpc-go/grpc-go/internal/transport/http2_client.go:468 +0x3df4
FAIL	google.golang.org/grpc/internal/transport	420.030s
@arjan-bal arjan-bal added Type: Bug Type: Testing Area: Testing Includes tests and testing utilities that we have for unit and e2e tests within our repo. labels Sep 10, 2024
@purnesh42H
Copy link
Contributor

should area be transport?

@aranjans aranjans added Area: Transport Includes HTTP/2 client/server and HTTP server handler transports and advanced transport features. and removed Area: Testing Includes tests and testing utilities that we have for unit and e2e tests within our repo. labels Sep 17, 2024
@aranjans
Copy link
Contributor

yes, this should be transport. Updated.

@eshitachandwani
Copy link
Member

@aranjans
Copy link
Contributor

RCA: #7371 PR introduced a timeout logic to loopyWriter to exit but t.Close uses GetGoAwayReason to fetch the last goAway and it obtains t.mu.Lock(). Test/ClientCloseReturnsEarlyWhenGoAwayWriteHangs overrides the http/2 Write method and blocks forever, so in case t.Close() is called and we sent out goAwayFrame here and this gets executed before we write the goAway frame over the wire here, this forms a deadlock.
Hence, in order to fix it we can fetch the goAwayReason first and then put the goAway frame into cbuf so to avoid such deadlock conditions.

We have this PR in review for the same.

@easwars
Copy link
Contributor

easwars commented Oct 2, 2024

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Area: Transport Includes HTTP/2 client/server and HTTP server handler transports and advanced transport features. Type: Bug Type: Testing
Projects
None yet
Development

Successfully merging a pull request may close this issue.

5 participants