11use crate :: cloud_init:: { CloudInitDisk , GUEST_USER } ;
22use crate :: process:: CpuModel as Cpu ;
33use crate :: process:: { ExpectedOutput , Machine , QemuConfig , QemuPayload , QemuProcess , RtcClock } ;
4- use crate :: tests:: full_os:: { OS_READY_PATTERN , SSH_ARGS , ssh_command} ;
4+ use crate :: tests:: full_os:: { OS_READY_PATTERN , ssh_command} ;
55use crate :: util:: { NetConfig , allocate_taps, generate_mac} ;
66use anyhow:: { Context , Result , bail, ensure} ;
77use log:: debug;
88use qapi:: qmp:: { self , RunState } ;
9- use std:: path:: { Path , PathBuf } ;
10- use std:: process:: { Child , Command , Stdio } ;
9+ use std:: io:: { BufRead , BufReader , Write } ;
10+ use std:: net:: TcpStream ;
11+ use std:: thread:: sleep;
1112use std:: time:: { Duration , Instant } ;
1213use test_macro:: test_fn;
1314
@@ -18,144 +19,59 @@ const INITRD: &str = "payload/initrd.img";
1819const OS_IMAGE : & str = "payload/os-image.qcow2" ;
1920const OS_BOOT_TIMEOUT : Duration = Duration :: from_secs ( 60 ) ;
2021const SSH_TIMEOUT : Duration = Duration :: from_secs ( 30 ) ;
21-
22- /// Persistent SSH connection using ControlMaster.
23- /// The TCP session should survive VM live migration on a bridge,
24- /// proving that network state migrated correctly.
25- struct SshSession {
26- child : Child ,
27- control_path : PathBuf ,
28- key_path : PathBuf ,
29- host : String ,
30- user : String ,
22+ const ECHO_PORT : u16 = 7777 ;
23+ const ECHO_SERVER_CMD : & str = concat ! (
24+ "nohup python3 -c '" ,
25+ "import socket; " ,
26+ "s=socket.socket(); " ,
27+ "s.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR,1); " ,
28+ "s.bind((\" 0.0.0.0\" ,7777)); " ,
29+ "s.listen(1); " ,
30+ "c,_=s.accept(); " ,
31+ "[c.sendall(d) for d in iter(lambda:c.recv(4096),b\" \" )]" ,
32+ "' </dev/null >/dev/null 2>&1 &" ,
33+ ) ;
34+
35+ /// Send a line over a TcpStream and read the echoed response.
36+ fn echo_roundtrip ( stream : & mut TcpStream , msg : & str , timeout : Duration ) -> Result < String > {
37+ stream
38+ . set_write_timeout ( Some ( timeout) )
39+ . context ( "set_write_timeout" ) ?;
40+ stream
41+ . set_read_timeout ( Some ( timeout) )
42+ . context ( "set_read_timeout" ) ?;
43+ writeln ! ( stream, "{msg}" ) . context ( "echo write" ) ?;
44+ stream. flush ( ) . context ( "echo flush" ) ?;
45+ let mut reader = BufReader :: new ( stream. try_clone ( ) . context ( "clone stream" ) ?) ;
46+ let mut line = String :: new ( ) ;
47+ reader. read_line ( & mut line) . context ( "echo read" ) ?;
48+ Ok ( line. trim ( ) . to_string ( ) )
3149}
3250
33- impl SshSession {
34- /// Open a ControlMaster connection. Call only after SSH is known to be
35- /// reachable (e.g. after a successful `ssh_command`).
36- fn open (
37- key_path : & Path ,
38- host : & str ,
39- port : u16 ,
40- user : & str ,
41- dir : & Path ,
42- timeout : Duration ,
43- ) -> Result < Self > {
44- let control_path = dir. join ( "ssh_ctl" ) ;
45- let key_str = key_path. to_string_lossy ( ) . to_string ( ) ;
46- let ctl_str = format ! ( "ControlPath={}" , control_path. display( ) ) ;
47- let port_str = port. to_string ( ) ;
48- let user_host = format ! ( "{user}@{host}" ) ;
49- let var_args: Vec < & str > = vec ! [
50- "-i" ,
51- & key_str,
52- "-o" ,
53- "ControlMaster=yes" ,
54- "-o" ,
55- & ctl_str,
56- "-o" ,
57- "ControlPersist=yes" ,
58- "-p" ,
59- & port_str,
60- "-N" ,
61- & user_host,
62- ] ;
63- let mut args = SSH_ARGS . to_vec ( ) ;
64- args. extend_from_slice ( & var_args) ;
65-
66- let mut child = Command :: new ( "ssh" )
67- . args ( & args)
68- . stdin ( Stdio :: null ( ) )
69- . stdout ( Stdio :: null ( ) )
70- . stderr ( Stdio :: piped ( ) )
71- . spawn ( )
72- . context ( "failed to start SSH ControlMaster" ) ?;
73-
74- let start = Instant :: now ( ) ;
75- while !control_path. exists ( ) {
76- if start. elapsed ( ) > timeout {
77- let _ = child. kill ( ) ;
78- let _ = child. wait ( ) ;
79- bail ! ( "SSH ControlMaster socket did not appear within {timeout:?}" ) ;
80- }
81- if let Some ( status) = child. try_wait ( ) . context ( "failed to check ssh process" ) ? {
82- let stderr = child
83- . stderr
84- . take ( )
85- . map ( |s| std:: io:: read_to_string ( s) . unwrap_or_default ( ) )
86- . unwrap_or_default ( ) ;
87- bail ! ( "SSH ControlMaster exited early ({status}): {stderr}" ) ;
51+ /// Connect to the guest echo server, retrying until `timeout`.
52+ fn connect_echo ( host : & str , port : u16 , timeout : Duration ) -> Result < TcpStream > {
53+ let start = Instant :: now ( ) ;
54+ loop {
55+ sleep ( Duration :: from_millis ( 200 ) ) ;
56+ if crate :: SHUTDOWN . load ( std:: sync:: atomic:: Ordering :: Relaxed ) {
57+ bail ! ( "interrupted" ) ;
58+ }
59+ match TcpStream :: connect_timeout (
60+ & format ! ( "{host}:{port}" ) . parse ( ) . context ( "parse addr" ) ?,
61+ Duration :: from_secs ( 2 ) ,
62+ ) {
63+ Ok ( stream) => return Ok ( stream) ,
64+ Err ( e) => {
65+ if start. elapsed ( ) > timeout {
66+ bail ! ( "echo server not reachable after {timeout:?}: {e}" ) ;
67+ }
68+ debug ! ( "echo connect failed ({e}), retrying..." ) ;
69+ std:: thread:: sleep ( Duration :: from_secs ( 1 ) ) ;
8870 }
89- std:: thread:: sleep ( Duration :: from_millis ( 200 ) ) ;
9071 }
91- debug ! (
92- "SSH ControlMaster established at {}" ,
93- control_path. display( )
94- ) ;
95-
96- Ok ( Self {
97- child,
98- control_path,
99- key_path : key_path. to_path_buf ( ) ,
100- host : host. to_string ( ) ,
101- user : user. to_string ( ) ,
102- } )
103- }
104-
105- /// Run a command through the persistent connection.
106- fn run ( & self , command : & str ) -> Result < String > {
107- let var_args = [
108- "-i" ,
109- & self . key_path . to_string_lossy ( ) ,
110- "-o" ,
111- & format ! ( "ControlPath={}" , self . control_path. display( ) ) ,
112- & format ! ( "{}@{}" , self . user, self . host) ,
113- command,
114- ] ;
115- let mut args = SSH_ARGS . to_vec ( ) ;
116- args. extend_from_slice ( & var_args) ;
117-
118- let output = Command :: new ( "ssh" )
119- . args ( args)
120- . output ( )
121- . context ( "failed to run ssh command via control socket" ) ?;
122-
123- ensure ! (
124- output. status. success( ) ,
125- "SSH command failed: {}" ,
126- String :: from_utf8_lossy( & output. stderr)
127- ) ;
128- let stdout = String :: from_utf8_lossy ( & output. stdout ) . trim ( ) . to_string ( ) ;
129- debug ! ( "ssh (ctl) output: {stdout}" ) ;
130- Ok ( stdout)
131- }
132- }
133-
134- impl Drop for SshSession {
135- fn drop ( & mut self ) {
136- debug ! ( "closing SSH ControlMaster" ) ;
137- let _ = Command :: new ( "ssh" )
138- . args ( [
139- "-o" ,
140- & format ! ( "ControlPath={}" , self . control_path. display( ) ) ,
141- "-O" ,
142- "exit" ,
143- "dummy" ,
144- ] )
145- . output ( ) ;
146- let _ = self . child . wait ( ) ;
14772 }
14873}
14974
150- /// Parse the first field of `/proc/uptime` (seconds since boot).
151- fn parse_uptime ( raw : & str ) -> Result < f64 > {
152- raw. split_whitespace ( )
153- . next ( )
154- . context ( "empty uptime output" ) ?
155- . parse ( )
156- . context ( "failed to parse uptime" )
157- }
158-
15975fn do_migration (
16076 src : & mut QemuProcess ,
16177 dst : & mut QemuProcess ,
@@ -296,7 +212,7 @@ pub(crate) fn test_live_migration_os(machine: Machine, smp: u8) -> Result<()> {
296212 . context ( "source VM did not boot" ) ?;
297213 debug ! ( "source VM booted" ) ;
298214
299- // Verify SSH on source (with retries until guest network is ready)
215+ // Wait for SSH to become available
300216 ssh_command (
301217 & ci. ssh_key_path ,
302218 taps. guest_host ( ) ,
@@ -308,20 +224,30 @@ pub(crate) fn test_live_migration_os(machine: Machine, smp: u8) -> Result<()> {
308224 . context ( "SSH not reachable on source" ) ?;
309225 debug ! ( "source SSH is reachable" ) ;
310226
311- // Open persistent SSH connection (SSH is known-reachable at this point )
312- let session = SshSession :: open (
227+ // Start a TCP echo server in the guest using python3 (always available )
228+ ssh_command (
313229 & ci. ssh_key_path ,
314230 taps. guest_host ( ) ,
315231 22 ,
316232 GUEST_USER ,
317- mig_dir . path ( ) ,
318- Duration :: from_secs ( 10 ) ,
233+ ECHO_SERVER_CMD ,
234+ SSH_TIMEOUT ,
319235 )
320- . context ( "failed to open SSH ControlMaster" ) ?;
236+ . context ( "failed to start echo server" ) ?;
237+ debug ! ( "echo server started on guest port {ECHO_PORT}" ) ;
238+
239+ // Open a persistent TCP connection to the echo server
240+ let mut stream = connect_echo ( taps. guest_host ( ) , ECHO_PORT , SSH_TIMEOUT )
241+ . context ( "failed to connect to echo server" ) ?;
242+ debug ! ( "TCP connection established to echo server" ) ;
321243
322- // Record uptime before migration
323- let uptime_before = parse_uptime ( & session. run ( "cat /proc/uptime" ) ?) ?;
324- debug ! ( "uptime before migration: {uptime_before:.1}s" ) ;
244+ // Verify echo works before migration
245+ let reply = echo_roundtrip ( & mut stream, "before-migration" , Duration :: from_secs ( 5 ) ) ?;
246+ ensure ! (
247+ reply == "before-migration" ,
248+ "unexpected echo reply before migration: {reply}"
249+ ) ;
250+ debug ! ( "echo verified before migration" ) ;
325251
326252 // Spawn destination in incoming mode with its own cidata copy
327253 let dst_cfg = base_cfg
@@ -338,13 +264,13 @@ pub(crate) fn test_live_migration_os(machine: Machine, smp: u8) -> Result<()> {
338264 drop ( src) ;
339265 debug ! ( "source VM terminated" ) ;
340266
341- // Verify the persistent SSH session survived migration
342- let uptime_after = parse_uptime ( & session. run ( "cat /proc/uptime" ) ?) ?;
343- debug ! ( "uptime after migration: {uptime_after:.1}s" ) ;
267+ // Verify the same TCP connection still works after migration
268+ let reply = echo_roundtrip ( & mut stream, "after-migration" , Duration :: from_secs ( 10 ) ) ?;
344269 ensure ! (
345- uptime_after > uptime_before ,
346- "timer not advancing after migration: {uptime_before:.1}s -> {uptime_after:.1}s "
270+ reply == "after-migration" ,
271+ "unexpected echo reply after migration: {reply} "
347272 ) ;
273+ debug ! ( "echo verified after migration — TCP connection survived" ) ;
348274
349275 Ok ( ( ) )
350276}
0 commit comments