Raft cluster node identity conflict issue · Issue #643 · hashicorp/raft

Open
YouZhengChuan opened this issue May 14, 2025 · 0 comments
I implemented Raft clustering in Go using the "github.com/hashicorp/raft" module (v1.7.3) and ran into a problem in the following scenario:

There are two Raft clusters; their names, member nodes, and addresses are as follows (both clusters are initialized through the BootstrapCluster method). A sketch of the resulting configurations follows the two lists below.

Cluster1 BootstrapCluster servers:

  • node1: {raft.ServerID: c1-node1, raft.ServerAddress: 192.168.100.1:7000}
  • node2: {raft.ServerID: c1-node2, raft.ServerAddress: 192.168.100.2:7000}
  • node3: {raft.ServerID: c1-node3, raft.ServerAddress: 192.168.100.3:7000}

Cluster2 BootstrapCluster servers:

  • node3: {raft.ServerID: c2-node3, raft.ServerAddress: 192.168.100.3:7000}
  • node4: {raft.ServerID: c2-node4, raft.ServerAddress: 192.168.100.4:7000}
  • node5: {raft.ServerID: c2-node5, raft.ServerAddress: 192.168.100.5:7000}
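For illustration, here is a small self-contained sketch (not taken from my code) of the two bootstrap configurations as raft.Configuration values, built from the lists above; the address 192.168.100.3:7000 is a voter in both clusters, under two different server IDs:

package main

import (
	"fmt"

	"github.com/hashicorp/raft"
)

func main() {
	// Cluster1's bootstrap configuration.
	cluster1 := raft.Configuration{Servers: []raft.Server{
		{Suffrage: raft.Voter, ID: "c1-node1", Address: "192.168.100.1:7000"},
		{Suffrage: raft.Voter, ID: "c1-node2", Address: "192.168.100.2:7000"},
		{Suffrage: raft.Voter, ID: "c1-node3", Address: "192.168.100.3:7000"}, // same address as c2-node3
	}}
	// Cluster2's bootstrap configuration.
	cluster2 := raft.Configuration{Servers: []raft.Server{
		{Suffrage: raft.Voter, ID: "c2-node3", Address: "192.168.100.3:7000"}, // same address as c1-node3
		{Suffrage: raft.Voter, ID: "c2-node4", Address: "192.168.100.4:7000"},
		{Suffrage: raft.Voter, ID: "c2-node5", Address: "192.168.100.5:7000"},
	}}
	fmt.Println(cluster1)
	fmt.Println(cluster2)
}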

Of these, "node1" and "node2" are started with the "Cluster1" configuration:

sudo ./raft_svr -cluster 'c1-node1,127.0.0.1,800;c1-node2,127.0.0.2,800;c1-node3,127.0.0.3,800' -id c1-node1

sudo ./raft_svr -cluster 'c1-node1,127.0.0.1,800;c1-node2,127.0.0.2,800;c1-node3,127.0.0.3,800' -id c1-node2

"node3","node4","node5" first start according to "Cluster2":

sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node3

sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node4

sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node5

You will then find that "node3" switches back and forth between "Cluster1" and "Cluster2", sometimes belonging to "Cluster1" and sometimes to "Cluster2":

INFO[0170] current state:Follower, servers:[{Suffrage:Voter ID:c2-node3 Address:127.0.0.3:800} {Suffrage:Voter ID:c2-node4 Address:127.0.0.4:800} {Suffrage:Voter ID:c2-node5 Address:127.0.0.5:800}], leader address:127.0.0.5:800, last contact:2025-05-14 15:35:53.330867 +0800 CST m=+169.779019126
INFO[0171] current state:Follower, servers:[{Suffrage:Voter ID:c2-node3 Address:127.0.0.3:800} {Suffrage:Voter ID:c2-node4 Address:127.0.0.4:800} {Suffrage:Voter ID:c2-node5 Address:127.0.0.5:800}], leader address:127.0.0.1:800, last contact:2025-05-14 15:35:54.308388 +0800 CST m=+170.756576126

Is this situation expected?

Here is my code:

package main

import (
	"flag"
	"fmt"
	"io"
	"net"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/hashicorp/raft" // github.com/hashicorp/raft v1.7.3
	log "github.com/sirupsen/logrus"
)

type raftCluster struct {
	localRaftID     raft.ServerID
	servers         map[raft.ServerID]raft.ServerAddress // raftID : raftAddressPort
	raft            *raft.Raft
	electionTimeout time.Duration
}

func (r *raftCluster) Start() error {
	config := raft.DefaultConfig()
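	// Relax the default timeouts (assumption: to make leader elections and state changes easier to observe in the logs).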
	config.HeartbeatTimeout = 2000 * time.Millisecond
	config.ElectionTimeout = 5000 * time.Millisecond
	config.CommitTimeout = 2000 * time.Millisecond
	config.LeaderLeaseTimeout = 2000 * time.Millisecond
	config.LocalID = r.localRaftID
	config.LogOutput = log.StandardLogger().Out

	r.electionTimeout = config.ElectionTimeout * time.Duration(len(r.servers)*2)

	localAddressPort := string(r.servers[r.localRaftID])
	tcpAddr, err := net.ResolveTCPAddr("tcp", localAddressPort)
	if err != nil {
		return fmt.Errorf("resolve tcp address %s, %v", localAddressPort, err)
	}
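	// Bind and advertise the same address; pool of 2 connections and a 10s I/O timeout.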
	transport, err := raft.NewTCPTransport(localAddressPort, tcpAddr, 2, 10*time.Second, log.StandardLogger().Out)
	if err != nil {
		return fmt.Errorf("fail to create tcp transport, localAddressPort:%s, tcpAddr:%v, %v",
			localAddressPort, tcpAddr, err)
	}
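	// All raft state (log, stable store, snapshots) lives in memory, so every restart begins empty and bootstraps again.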
	snapshots := raft.NewInmemSnapshotStore()
	logStore := raft.NewInmemStore()
	stableStore := raft.NewInmemStore()
	fm := NewFsm()
	r.raft, err = raft.NewRaft(config, fm, logStore, stableStore, snapshots, transport)
	if err != nil {
		return fmt.Errorf("create raft error, %v", err)
	}

	var configuration raft.Configuration
	for sID, addr := range r.servers {
		server := raft.Server{
			ID:      sID,
			Address: addr,
		}
		configuration.Servers = append(configuration.Servers, server)
	}
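	// BootstrapCluster only succeeds when the stores have no prior state; with the in-memory stores above, that is every start of this process.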
	err = r.raft.BootstrapCluster(configuration).Error()
	if err != nil {
		return fmt.Errorf("raft bootstrap faild, conf:%v, %v", configuration, err)
	}
	log.Infof("bootstrap cluster as config: %v", configuration)

	return nil
}

func (r *raftCluster) checkLeaderState() {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
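		// LeaderCh signals when this node acquires or loses leadership.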
		case leader := <-r.raft.LeaderCh():
			log.Infof("im leader:%v, state:%s, leader address:%s", leader, r.raft.State(), r.raft.Leader())

		case <-ticker.C:
			verifyErr := r.raft.VerifyLeader().Error()
			servers := r.raft.GetConfiguration().Configuration().Servers
			switch verifyErr {
			case nil:
				log.Infof("im leader, servers:%v", servers)
			case raft.ErrNotLeader:
				// check cluster leader
				log.Infof("current state:%v, servers:%+v, leader address:%v, last contact:%v",
					r.raft.State(), servers, r.raft.Leader(), r.raft.LastContact())
			}
		}
	}
}

func main() {
	var (
		clusters = flag.String("cluster", "",
			"cluster node address, fmt: ID,IP,Port;ID,IP,Port")
		clusterId = flag.String("id", "", "cluster id")
	)
	flag.Parse()

	if *clusterId == "" {
		log.Infof("cluster id messing")
		os.Exit(1)
	}

	servers := make(map[raft.ServerID]raft.ServerAddress)
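	// Parse the -cluster flag: semicolon-separated "ID,IP,Port" entries.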
	for _, cluster := range strings.Split(*clusters, ";") {
		info := strings.Split(cluster, ",")
		var (
			nid   string
			nip   net.IP
			nport int
			err   error
		)
		switch {
		case len(info) == 3:
			nid = info[0]
			nip = net.ParseIP(info[1])
			if nip == nil {
				log.Infof("cluster %s ip %s parse failed", cluster, info[1])
				os.Exit(1)
			}
			nport, err = strconv.Atoi(info[2])
			if err != nil {
				log.Infof("cluster %s port %s parse failed, %v", cluster, info[2], err)
				os.Exit(1)
			}
		default:
			log.Infof("cluster args value is bad format")
			os.Exit(1)
		}
		log.Infof("cluster node id:%s, ip:%v, port:%d", nid, nip, nport)
		addr := net.TCPAddr{IP: nip, Port: nport}
		servers[raft.ServerID(nid)] = raft.ServerAddress(addr.String())
	}

	r := raftCluster{
		localRaftID: raft.ServerID(*clusterId),
		servers:     servers,
	}
	err := r.Start()
	if err != nil {
		log.Infof("rafter cluster start failed, %v", err)
		os.Exit(1)
	}
	r.checkLeaderState()
}

// SimpleFsm implements a minimal no-op raft.FSM used only for this reproduction.

type SimpleFsm struct {
	db database
}

func NewFsm() *SimpleFsm {
	fsm := &SimpleFsm{
		db: NewDatabase(),
	}
	return fsm
}

func (f *SimpleFsm) Apply(l *raft.Log) interface{} {
	return nil
}

func (f *SimpleFsm) Snapshot() (raft.FSMSnapshot, error) {
	return &f.db, nil
}

func (f *SimpleFsm) Restore(io.ReadCloser) error {
	return nil
}

type database struct{}

func NewDatabase() database {
	return database{}
}

func (d *database) Get(key string) string {
	return "not implemented"
}

func (d *database) Set(key, value string) {}

func (d *database) Persist(sink raft.SnapshotSink) error {
	_, _ = sink.Write([]byte{})
	_ = sink.Close()
	return nil
}

func (d *database) Release() {}