Skip to content

Commit 358f89b

Browse files
committed
Replace ssh with tcp-echo in migration test
Signed-off-by: Magnus Kulke <magnuskulke@microsoft.com>
1 parent a54e353 commit 358f89b

1 file changed

Lines changed: 74 additions & 148 deletions

File tree

src/tests/migration.rs

Lines changed: 74 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
use crate::cloud_init::{CloudInitDisk, GUEST_USER};
22
use crate::process::CpuModel as Cpu;
33
use crate::process::{ExpectedOutput, Machine, QemuConfig, QemuPayload, QemuProcess, RtcClock};
4-
use crate::tests::full_os::{OS_READY_PATTERN, SSH_ARGS, ssh_command};
4+
use crate::tests::full_os::{OS_READY_PATTERN, ssh_command};
55
use crate::util::{NetConfig, allocate_taps, generate_mac};
66
use anyhow::{Context, Result, bail, ensure};
77
use log::debug;
88
use qapi::qmp::{self, RunState};
9-
use std::path::{Path, PathBuf};
10-
use std::process::{Child, Command, Stdio};
9+
use std::io::{BufRead, BufReader, Write};
10+
use std::net::TcpStream;
11+
use std::thread::sleep;
1112
use std::time::{Duration, Instant};
1213
use test_macro::test_fn;
1314

@@ -18,144 +19,59 @@ const INITRD: &str = "payload/initrd.img";
1819
const OS_IMAGE: &str = "payload/os-image.qcow2";
1920
const OS_BOOT_TIMEOUT: Duration = Duration::from_secs(60);
2021
const SSH_TIMEOUT: Duration = Duration::from_secs(30);
21-
22-
/// Persistent SSH connection using ControlMaster.
23-
/// The TCP session should survive VM live migration on a bridge,
24-
/// proving that network state migrated correctly.
25-
struct SshSession {
26-
child: Child,
27-
control_path: PathBuf,
28-
key_path: PathBuf,
29-
host: String,
30-
user: String,
22+
const ECHO_PORT: u16 = 7777;
23+
const ECHO_SERVER_CMD: &str = concat!(
24+
"nohup python3 -c '",
25+
"import socket; ",
26+
"s=socket.socket(); ",
27+
"s.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR,1); ",
28+
"s.bind((\"0.0.0.0\",7777)); ",
29+
"s.listen(1); ",
30+
"c,_=s.accept(); ",
31+
"[c.sendall(d) for d in iter(lambda:c.recv(4096),b\"\")]",
32+
"' </dev/null >/dev/null 2>&1 &",
33+
);
34+
35+
/// Send a line over a TcpStream and read the echoed response.
36+
fn echo_roundtrip(stream: &mut TcpStream, msg: &str, timeout: Duration) -> Result<String> {
37+
stream
38+
.set_write_timeout(Some(timeout))
39+
.context("set_write_timeout")?;
40+
stream
41+
.set_read_timeout(Some(timeout))
42+
.context("set_read_timeout")?;
43+
writeln!(stream, "{msg}").context("echo write")?;
44+
stream.flush().context("echo flush")?;
45+
let mut reader = BufReader::new(stream.try_clone().context("clone stream")?);
46+
let mut line = String::new();
47+
reader.read_line(&mut line).context("echo read")?;
48+
Ok(line.trim().to_string())
3149
}
3250

33-
impl SshSession {
34-
/// Open a ControlMaster connection. Call only after SSH is known to be
35-
/// reachable (e.g. after a successful `ssh_command`).
36-
fn open(
37-
key_path: &Path,
38-
host: &str,
39-
port: u16,
40-
user: &str,
41-
dir: &Path,
42-
timeout: Duration,
43-
) -> Result<Self> {
44-
let control_path = dir.join("ssh_ctl");
45-
let key_str = key_path.to_string_lossy().to_string();
46-
let ctl_str = format!("ControlPath={}", control_path.display());
47-
let port_str = port.to_string();
48-
let user_host = format!("{user}@{host}");
49-
let var_args: Vec<&str> = vec![
50-
"-i",
51-
&key_str,
52-
"-o",
53-
"ControlMaster=yes",
54-
"-o",
55-
&ctl_str,
56-
"-o",
57-
"ControlPersist=yes",
58-
"-p",
59-
&port_str,
60-
"-N",
61-
&user_host,
62-
];
63-
let mut args = SSH_ARGS.to_vec();
64-
args.extend_from_slice(&var_args);
65-
66-
let mut child = Command::new("ssh")
67-
.args(&args)
68-
.stdin(Stdio::null())
69-
.stdout(Stdio::null())
70-
.stderr(Stdio::piped())
71-
.spawn()
72-
.context("failed to start SSH ControlMaster")?;
73-
74-
let start = Instant::now();
75-
while !control_path.exists() {
76-
if start.elapsed() > timeout {
77-
let _ = child.kill();
78-
let _ = child.wait();
79-
bail!("SSH ControlMaster socket did not appear within {timeout:?}");
80-
}
81-
if let Some(status) = child.try_wait().context("failed to check ssh process")? {
82-
let stderr = child
83-
.stderr
84-
.take()
85-
.map(|s| std::io::read_to_string(s).unwrap_or_default())
86-
.unwrap_or_default();
87-
bail!("SSH ControlMaster exited early ({status}): {stderr}");
51+
/// Connect to the guest echo server, retrying until `timeout`.
52+
fn connect_echo(host: &str, port: u16, timeout: Duration) -> Result<TcpStream> {
53+
let start = Instant::now();
54+
loop {
55+
sleep(Duration::from_millis(200));
56+
if crate::SHUTDOWN.load(std::sync::atomic::Ordering::Relaxed) {
57+
bail!("interrupted");
58+
}
59+
match TcpStream::connect_timeout(
60+
&format!("{host}:{port}").parse().context("parse addr")?,
61+
Duration::from_secs(2),
62+
) {
63+
Ok(stream) => return Ok(stream),
64+
Err(e) => {
65+
if start.elapsed() > timeout {
66+
bail!("echo server not reachable after {timeout:?}: {e}");
67+
}
68+
debug!("echo connect failed ({e}), retrying...");
69+
std::thread::sleep(Duration::from_secs(1));
8870
}
89-
std::thread::sleep(Duration::from_millis(200));
9071
}
91-
debug!(
92-
"SSH ControlMaster established at {}",
93-
control_path.display()
94-
);
95-
96-
Ok(Self {
97-
child,
98-
control_path,
99-
key_path: key_path.to_path_buf(),
100-
host: host.to_string(),
101-
user: user.to_string(),
102-
})
103-
}
104-
105-
/// Run a command through the persistent connection.
106-
fn run(&self, command: &str) -> Result<String> {
107-
let var_args = [
108-
"-i",
109-
&self.key_path.to_string_lossy(),
110-
"-o",
111-
&format!("ControlPath={}", self.control_path.display()),
112-
&format!("{}@{}", self.user, self.host),
113-
command,
114-
];
115-
let mut args = SSH_ARGS.to_vec();
116-
args.extend_from_slice(&var_args);
117-
118-
let output = Command::new("ssh")
119-
.args(args)
120-
.output()
121-
.context("failed to run ssh command via control socket")?;
122-
123-
ensure!(
124-
output.status.success(),
125-
"SSH command failed: {}",
126-
String::from_utf8_lossy(&output.stderr)
127-
);
128-
let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
129-
debug!("ssh (ctl) output: {stdout}");
130-
Ok(stdout)
131-
}
132-
}
133-
134-
impl Drop for SshSession {
135-
fn drop(&mut self) {
136-
debug!("closing SSH ControlMaster");
137-
let _ = Command::new("ssh")
138-
.args([
139-
"-o",
140-
&format!("ControlPath={}", self.control_path.display()),
141-
"-O",
142-
"exit",
143-
"dummy",
144-
])
145-
.output();
146-
let _ = self.child.wait();
14772
}
14873
}
14974

150-
/// Parse the first field of `/proc/uptime` (seconds since boot).
151-
fn parse_uptime(raw: &str) -> Result<f64> {
152-
raw.split_whitespace()
153-
.next()
154-
.context("empty uptime output")?
155-
.parse()
156-
.context("failed to parse uptime")
157-
}
158-
15975
fn do_migration(
16076
src: &mut QemuProcess,
16177
dst: &mut QemuProcess,
@@ -296,7 +212,7 @@ pub(crate) fn test_live_migration_os(machine: Machine, smp: u8) -> Result<()> {
296212
.context("source VM did not boot")?;
297213
debug!("source VM booted");
298214

299-
// Verify SSH on source (with retries until guest network is ready)
215+
// Wait for SSH to become available
300216
ssh_command(
301217
&ci.ssh_key_path,
302218
taps.guest_host(),
@@ -308,20 +224,30 @@ pub(crate) fn test_live_migration_os(machine: Machine, smp: u8) -> Result<()> {
308224
.context("SSH not reachable on source")?;
309225
debug!("source SSH is reachable");
310226

311-
// Open persistent SSH connection (SSH is known-reachable at this point)
312-
let session = SshSession::open(
227+
// Start a TCP echo server in the guest using python3 (always available)
228+
ssh_command(
313229
&ci.ssh_key_path,
314230
taps.guest_host(),
315231
22,
316232
GUEST_USER,
317-
mig_dir.path(),
318-
Duration::from_secs(10),
233+
ECHO_SERVER_CMD,
234+
SSH_TIMEOUT,
319235
)
320-
.context("failed to open SSH ControlMaster")?;
236+
.context("failed to start echo server")?;
237+
debug!("echo server started on guest port {ECHO_PORT}");
238+
239+
// Open a persistent TCP connection to the echo server
240+
let mut stream = connect_echo(taps.guest_host(), ECHO_PORT, SSH_TIMEOUT)
241+
.context("failed to connect to echo server")?;
242+
debug!("TCP connection established to echo server");
321243

322-
// Record uptime before migration
323-
let uptime_before = parse_uptime(&session.run("cat /proc/uptime")?)?;
324-
debug!("uptime before migration: {uptime_before:.1}s");
244+
// Verify echo works before migration
245+
let reply = echo_roundtrip(&mut stream, "before-migration", Duration::from_secs(5))?;
246+
ensure!(
247+
reply == "before-migration",
248+
"unexpected echo reply before migration: {reply}"
249+
);
250+
debug!("echo verified before migration");
325251

326252
// Spawn destination in incoming mode with its own cidata copy
327253
let dst_cfg = base_cfg
@@ -338,13 +264,13 @@ pub(crate) fn test_live_migration_os(machine: Machine, smp: u8) -> Result<()> {
338264
drop(src);
339265
debug!("source VM terminated");
340266

341-
// Verify the persistent SSH session survived migration
342-
let uptime_after = parse_uptime(&session.run("cat /proc/uptime")?)?;
343-
debug!("uptime after migration: {uptime_after:.1}s");
267+
// Verify the same TCP connection still works after migration
268+
let reply = echo_roundtrip(&mut stream, "after-migration", Duration::from_secs(10))?;
344269
ensure!(
345-
uptime_after > uptime_before,
346-
"timer not advancing after migration: {uptime_before:.1}s -> {uptime_after:.1}s"
270+
reply == "after-migration",
271+
"unexpected echo reply after migration: {reply}"
347272
);
273+
debug!("echo verified after migration — TCP connection survived");
348274

349275
Ok(())
350276
}

0 commit comments

Comments
 (0)