fleetd: keep a list of listeners across reconfiguring fleetd by dongsupark · Pull Request #1517 · coreos/fleet · GitHub
This repository was archived by the owner on Jan 30, 2020. It is now read-only.

fleetd: keep a list of listeners across reconfiguring fleetd #1517

Merged · 4 commits · Apr 14, 2016
6 changes: 5 additions & 1 deletion api/server.go
@@ -38,6 +38,10 @@ type Server struct {
cur http.Handler
}

func (s *Server) GetListeners() []net.Listener {
return s.listeners
}

func (s *Server) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
s.cur.ServeHTTP(rw, req)
}
@@ -48,7 +52,7 @@ func (s *Server) Serve() {
go func() {
err := http.Serve(l, s)
if err != nil {
log.Errorf("Failed serving HTTP on listener: %v", l.Addr())
log.Errorf("Failed serving HTTP on listener: addr: %v, err: %v", l.Addr(), err)
}
}()
}
13 changes: 11 additions & 2 deletions fleetd/fleetd.go
100755 → 100644
@@ -99,7 +99,7 @@ func main() {
}

log.Debugf("Creating Server")
srv, err := server.New(*cfg)
srv, err := server.New(*cfg, nil)
if err != nil {
log.Fatalf("Failed creating Server: %v", err.Error())
}
@@ -119,13 +119,22 @@ func main() {
}

log.Infof("Restarting server components")
srv.SetReconfigServer(true)

// Get Server.listeners[] to keep it for a new server,
// before killing the old server.
oldListeners := srv.GetApiServerListeners()

srv.Kill()

srv, err = server.New(*cfg)
// The new server takes the original listeners.
srv, err = server.New(*cfg, oldListeners)
if err != nil {
log.Fatalf(err.Error())
}

srv.Run()
srv.SetReconfigServer(false)
}

shutdown := func() {
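The hunk above is the core of the change: before killing the old server, fleetd fetches the API server's net.Listener values via GetApiServerListeners() and hands them to server.New, so the reconfigured server keeps serving on the same sockets rather than going through socket activation again (server.New only falls back to activation.Listeners when no listeners are passed in). Below is a minimal, self-contained sketch of that mechanism using only the standard library; the port, the handler helper, and the "config-1"/"config-2" tags are invented for illustration, and the killc/reconfigServer bookkeeping of the real change is left out.

```go
package main

import (
	"fmt"
	"net"
	"net/http"
)

// handler returns a trivial HTTP handler tagged with the "configuration"
// that produced it, so reuse of the listener is easy to observe.
func handler(tag string) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "served by %s\n", tag)
	})
}

func main() {
	// Stand-in for the sockets fleetd normally obtains via systemd socket
	// activation (activation.Listeners in server.New).
	l, err := net.Listen("tcp", "127.0.0.1:9090")
	if err != nil {
		panic(err)
	}

	// Initial configuration.
	oldSrv := &http.Server{Handler: handler("config-1")}
	go oldSrv.Serve(l)

	// A "reload" happens here. The old server is simply left running in this
	// sketch (fleetd instead tears down its other components via srv.Kill()),
	// but the important part is that the listener itself is never closed.
	newSrv := &http.Server{Handler: handler("config-2")}
	go newSrv.Serve(l)

	// Clients connecting to 127.0.0.1:9090 never see the socket disappear.
	select {}
}
```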
133 changes: 133 additions & 0 deletions functional/server_test.go
@@ -0,0 +1,133 @@
// Copyright 2016 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package functional

import (
"fmt"
"strings"
"testing"

"github.com/coreos/fleet/functional/platform"
"github.com/coreos/fleet/functional/util"
)

// TestReconfigureServer checks whether fleetd managed to keep its listeners
// across reconfiguration of fleetd after receiving SIGHUP.
func TestReconfigureServer(t *testing.T) {
cluster, err := platform.NewNspawnCluster("smoke")
if err != nil {
t.Fatal(err)
}
defer cluster.Destroy()

m0, err := cluster.CreateMember()
if err != nil {
t.Fatal(err)
}
_, err = cluster.WaitForNMachines(m0, 1)
if err != nil {
t.Fatal(err)
}

// NOTE: we need to sleep once here to get reliable results.
// Without this sleep, the entire fleetd test always ends up succeeding
// no matter whether SIGHUP came or not.
_, _ = cluster.MemberCommand(m0, "sh", "-c", `'sleep 2'`)

Contributor:

Hmm, would it help after SIGHUP is sent to check the journal if "fleet.Reloading configuration." is really there ?

Contributor:

AIUI the problem is actually that we need to make sure fleetd is fully initialised before even sending the SIGHUP. I agree though that it would be better to wait for this by checking the logs. (Unless there is some more elegant method for checking fleetd's exact status?...)

Note that a proper check like that will also make the preceding checks using systemctl unnecessary.

Author (dongsupark):

@tixxdz

> Hmm, would it help after SIGHUP is sent to check the journal if "fleet.Reloading configuration." is really there ?

Yes, I updated it like that.
But that change didn't allow me to remove the sleep command before sending SIGHUP. Without the sleep, the test always ends up succeeding, no matter what.. :-(

Contributor:

@dongsupark actually, I just realised that WaitForNMachines() will only succeed once fleet is fully up? So the systemctl checks should definitely be redundant.

Also, I still don't understand why the sleep is necessary... But I guess that's not a showstopper.

err = waitForFleetdSocket(cluster, m0)
if err != nil {
t.Fatalf("Failed to get a list of fleetd sockets: %v", err)
}

// Send a SIGHUP to fleetd, then periodically check whether the message
// "Reloading configuration" appears in fleet's journal, up to the timeout (15 seconds).
stdout, _ := cluster.MemberCommand(m0, "sudo", "systemctl", "kill", "-s", "SIGHUP", "fleet")
if strings.TrimSpace(stdout) != "" {
t.Fatalf("Sending SIGHUP to fleetd returned: %s", stdout)
}

err = waitForReloadConfig(cluster, m0)
if err != nil {
t.Fatalf("Failed to get log about reconfiguration: %v", err)
}
Contributor:

Do we still need this? I would assume it's no longer necessary, now that we actually check for the socket connection still working?...

Contributor:

I see that you use it here now to check for fleet being up, which makes sense I guess. With this in place, do you still need the sleep above?...

Author (dongsupark):

> I see that you use it here now to check for fleet being up, which makes sense I guess. With this in place, do you still need the sleep above?...

Unfortunately yes, I still need the sleep. I just did another round of tests, but it's still the same.
As mentioned above, without the sleep, the whole test always succeeds. What would be the point of testing this at all if it always gave the same answer? That's where I'm puzzled, but for me it's more important to get the test working at all.


// Check that fleetd is still running correctly by running fleetctl list-units
_, _, err = cluster.Fleetctl(m0, "list-units")
if err != nil {
Contributor:

@dongsu: could you please change this to:

fleet $ ./bin/fleetctl list-units; echo $?
Error retrieving list of units from repository: googleapi: Error 503: fleet server unable to communicate with etcd
1
fleet $ ./bin/fleetctl status; echo $?
0

The second one succeeded.
That's my result of experimenting right now; at least it is the same as in the originally reported bug.

Thank you!

Author (dongsupark):

@tixxdz this reveals that there's still a bug somewhere. I'll try to investigate soon.

Author (dongsupark):

@tixxdz Fixed a bug. Checking with "fleetctl list-units" is now working.

t.Fatalf("Unable to check list-units. Please check for fleetd socket. err:%v", err)
}

// Check for HTTP listener errors by looking into the fleetd journal
stdout, _ = cluster.MemberCommand(m0, "journalctl _PID=$(pidof fleetd)")
if strings.Contains(strings.TrimSpace(stdout), "Failed serving HTTP on listener:") {
t.Fatalf("Fleetd log returned error on HTTP listeners: %s", stdout)
}

Contributor:

The check seems fine, but what worries me is that we are checking for an error message that might disappear for other reasons! Is there a better way to check for the sockets, e.g. systemctl status fleetd, grep the pid, then inspect /proc/<pid>/*?

Author (dongsupark):

Yeah, I was just lazy when I wrote this part. I'll try to change it like that.

Contributor:

Trying to check sockets sounds complicated, and I'm not convinced it's really a good idea to rely on internals like that...

TBH, I don't think either of these approaches is really the right way to test this. Rather, we should try to check whether actual socket connections keep working as expected. I don't have any specific setup in mind for this -- but surely it should be possible to do this?

Contributor:

@antrik indeed, if it's easy to check whether the socket is still working in this context then go for it? My comment was a bit generic...

Contributor:

> Rather, we should try to check whether actual socket connections keep working as expected.

+1, test for the desired behaviour by testing the desired behaviour :-)

Author (dongsupark):

> we should try to check whether actual socket connections keep working as expected

Done.
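
For reference, a sketch of what probing the socket directly could look like in this test. It assumes curl is available inside the nspawn member and that fleet's v1 API (for example /fleet/v1/machines) answers on /var/run/fleet.sock; both are assumptions, not something this PR establishes. The merged test exercises the same socket through fleetctl list-units instead.

```go
// Hypothetical end-to-end probe (not part of this PR): hit the fleet API over
// its Unix socket after the SIGHUP and fail the test if it stops answering.
out, cerr := cluster.MemberCommand(m0,
	"curl", "-sf", "--unix-socket", "/var/run/fleet.sock",
	"http://localhost/fleet/v1/machines")
if cerr != nil || strings.TrimSpace(out) == "" {
	t.Fatalf("fleet API socket not answering after SIGHUP: err=%v, out=%q", cerr, out)
}
```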

// Check expected state after reconfiguring fleetd
stdout, _ = cluster.MemberCommand(m0, "systemctl", "show", "--property=ActiveState", "fleet")
if strings.TrimSpace(stdout) != "ActiveState=active" {
t.Fatalf("Fleet unit not reported as active: %s", stdout)
}
stdout, _ = cluster.MemberCommand(m0, "systemctl", "show", "--property=Result", "fleet")
if strings.TrimSpace(stdout) != "Result=success" {
t.Fatalf("Result for fleet unit not reported as success: %s", stdout)
}
}

// waitForReloadConfig returns nil once the message "Reloading configuration" appears
// in the journal, polling the journal periodically up to the timeout.
func waitForReloadConfig(cluster platform.Cluster, m0 platform.Member) (err error) {
_, err = util.WaitForState(
func() bool {
// NOTE: journalctl should be run plainly, like "journalctl -u fleet",
// without being piped into grep. Doing
// "journalctl -u fleet | grep \"Reloading configuration\"" is racy
// in a subtle way, so that it sometimes fails, but only on semaphoreci.
// - dpark 20160408
stdout, _ := cluster.MemberCommand(m0, "journalctl _PID=$(pidof fleetd)")
journalfleet := strings.TrimSpace(stdout)
Contributor:

@dongsupark does it make a difference here if you do "journalctl _PID=$(pidof fleetd)" ?

Contributor:

I don't see why pidof would be more reliable than -u? Do we expect more than one instance of fleetd to exist during the test? (And if so, is it actually appropriate to consider only the currently active one?...)

(I don't see why using grep wouldn't be reliable in the first place, though... But doing the compare in Go is fine too; so I guess this is a non-issue.)

Contributor:

What are the gains of using -u, when we just want what the pid logs, which is more precise in this context?

Contributor:

And yeah, I forgot to say that some developers are also just lazy, so they run the tests again and again on the same VM, grepping the old bogus journal logs...

Contributor:

@tixxdz there are no "old bogus journal logs" -- each test spawns fresh containers with fresh logs. And I still think the -u abstraction offered to us by journalctl is more elegant than dealing with low-level details such as PIDs ourselves...

(PIDs are in fact prone to race conditions in general -- though I guess that's not really an issue in this specific case.)
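
For comparison, the unit-based variant discussed here would make the polling closure look roughly like the following. This is only an illustration of the alternative; the PR keeps the _PID filter.

```go
// Hypothetical alternative (not in the PR): filter the journal by unit name
// instead of by the current fleetd PID.
stdout, _ := cluster.MemberCommand(m0, "journalctl -u fleet --no-pager")
if !strings.Contains(stdout, "Reloading configuration") {
	return false // not reloaded yet; util.WaitForState will poll again
}
return true
```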

if !strings.Contains(journalfleet, "Reloading configuration") {
fmt.Errorf("Fleetd is not fully reconfigured, retrying... entire fleet journal:\n%v", journalfleet)
return false
}
return true
},
)
if err != nil {
return fmt.Errorf("Reloading configuration log not found: %v", err)
}

return nil
}

// waitForFleetdSocket returns nil once /var/run/fleet.sock exists, periodically
// checking for it up to the timeout.
func waitForFleetdSocket(cluster platform.Cluster, m0 platform.Member) (err error) {
_, err = util.WaitForState(
func() bool {
stdout, _ := cluster.MemberCommand(m0, "test -S /var/run/fleet.sock && echo 1")
if strings.TrimSpace(stdout) == "" {
fmt.Errorf("Fleetd is not fully started, retrying...")
return false
}
return true
},
)
if err != nil {
return fmt.Errorf("Fleetd socket not found: %v", err)
}

return nil
}
45 changes: 30 additions & 15 deletions server/server.go
@@ -17,6 +17,7 @@ package server
import (
"encoding/json"
"errors"
"net"
"net/http"
"sync"
"time"
@@ -48,16 +49,17 @@ const (
)

type Server struct {
agent *agent.Agent
aReconciler *agent.AgentReconciler
usPub *agent.UnitStatePublisher
usGen *unit.UnitStateGenerator
engine *engine.Engine
mach *machine.CoreOSMachine
hrt heart.Heart
mon *Monitor
api *api.Server
disableEngine bool
agent *agent.Agent
aReconciler *agent.AgentReconciler
usPub *agent.UnitStatePublisher
usGen *unit.UnitStateGenerator
engine *engine.Engine
mach *machine.CoreOSMachine
hrt heart.Heart
mon *Monitor
api *api.Server
disableEngine bool
reconfigServer bool

engineReconcileInterval time.Duration

@@ -66,7 +68,7 @@ type Server struct {
wg sync.WaitGroup // used to co-ordinate shutdown
}

func New(cfg config.Config) (*Server, error) {
func New(cfg config.Config, listeners []net.Listener) (*Server, error) {
agentTTL, err := time.ParseDuration(cfg.AgentTTL)
if err != nil {
return nil, err
@@ -115,9 +117,11 @@ func New(cfg config.Config) (*Server, error) {

e := engine.New(reg, lManager, rStream, mach)

listeners, err := activation.Listeners(false)
if err != nil {
return nil, err
if len(listeners) == 0 {
listeners, err = activation.Listeners(false)
if err != nil {
return nil, err
}
}

hrt := heart.New(reg, mach)
Expand All @@ -142,6 +146,7 @@ func New(cfg config.Config) (*Server, error) {
stopc: nil,
engineReconcileInterval: eIval,
disableEngine: cfg.DisableEngine,
reconfigServer: false,
}

return &srv, nil
@@ -239,7 +244,9 @@ func (s *Server) Supervise() {

// Kill is used to gracefully terminate the server by triggering the Monitor to shut down
func (s *Server) Kill() {
close(s.killc)
if !s.reconfigServer {
close(s.killc)
}
}

func (s *Server) Purge() {
@@ -260,3 +267,11 @@ func (s *Server) MarshalJSON() ([]byte, error) {
UnitStateGenerator: s.usGen,
})
}

func (s *Server) GetApiServerListeners() []net.Listener {
return s.api.GetListeners()
}

func (s *Server) SetReconfigServer(isReconfigServer bool) {
s.reconfigServer = isReconfigServer
}