[RFC PATCH v2 8/8] WIP: kernel/time: Add /dev/vmclock_host miscdev

From: David Woodhouse

Date: Sun May 17 2026 - 18:04:55 EST


From: David Woodhouse <dwmw@xxxxxxxxxxxx>

Expose the host's NTP-disciplined clock as a vmclock_abi page via
/dev/vmclock_host. A VMM can mmap or poll() this device to obtain
precision time parameters for relaying to guests.

The page is updated only when ntp_tick changes (i.e., when NTP
actually adjusts the frequency), not on every timekeeping tick.
This avoids the per-tick overhead of the existing pvclock_gtod
notifier while providing the same information.

Fields populated:
- counter_id: X86_TSC or ARM_VCNT (INVALID for any other clocksource)
- time_type: TAI
- counter_value: TSC at reference point
- time_sec/time_frac_sec: TAI at reference point
- counter_period_frac_sec: NTP-disciplined TSC period
- tai_offset_sec: current UTC-TAI offset

NOT YET DONE:
- Error bounds (esterror/maxerror)
- Leap second indicator
- Disruption marker (needs clocksource change hook)
- Selftest (a basic CLOCK_TAI comparison is included; more coverage needed)
---
include/linux/vmclock_host.h | 17 +
kernel/time/Kconfig | 8 +
kernel/time/Makefile | 1 +
kernel/time/ntp.c | 3 +-
kernel/time/ntp_internal.h | 1 +
kernel/time/timekeeping.c | 6 +
kernel/time/vmclock_host.c | 319 ++++++++++++++++++
.../selftests/timers/vmclock_host_test.c | 171 ++++++++++
8 files changed, 525 insertions(+), 1 deletion(-)
create mode 100644 include/linux/vmclock_host.h
create mode 100644 kernel/time/vmclock_host.c
create mode 100644 tools/testing/selftests/timers/vmclock_host_test.c

diff --git a/include/linux/vmclock_host.h b/include/linux/vmclock_host.h
new file mode 100644
index 000000000000..388a5a1b470c
--- /dev/null
+++ b/include/linux/vmclock_host.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VMCLOCK_HOST_H
+#define _LINUX_VMCLOCK_HOST_H
+
+struct timekeeper;
+
+extern void (*vmclock_host_update_fn)(struct timekeeper *tk);
+
+/*
+ * Invoke the registered vmclock host hook, if any, for timekeeper @tk.
+ *
+ * The hook pointer is sampled exactly once into a local, so the value
+ * that is tested for NULL is the same value that gets called.
+ */
+static inline void vmclock_host_update(struct timekeeper *tk)
+{
+	void (*hook)(struct timekeeper *) = READ_ONCE(vmclock_host_update_fn);
+
+	if (!hook)
+		return;
+
+	hook(tk);
+}
+
+#endif /* _LINUX_VMCLOCK_HOST_H */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 02aac7c5aa76..493ffda434a8 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -221,4 +221,12 @@ config POSIX_AUX_CLOCKS
and other clock domains, which are not correlated to the TAI/NTP
notion of time.

+config VMCLOCK_HOST
+ tristate "VMClock host time provider (/dev/vmclock_host)"
+ depends on X86_TSC || ARM64
+ help
+ Expose the host NTP-disciplined clock as a vmclock page via
+ /dev/vmclock_host for VMMs to relay precision time to guests.
+
endmenu
+
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index eaf290c972f9..549070254e3a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_TIME_NS) += namespace.o
obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o
obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
+obj-$(CONFIG_VMCLOCK_HOST) += vmclock_host.o
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f4bf7e78c230..e60d9f7da9e3 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -676,10 +676,11 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns
* ntp_synced - Tells whether the NTP status is not UNSYNC
* Returns: true if not UNSYNC, false otherwise
*/
-static inline bool ntp_synced(void)
+bool ntp_synced(void)
{
return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC);
}
+EXPORT_SYMBOL_GPL(ntp_synced);

/*
* If we have an externally synchronized Linux clock, then update RTC clock
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index d0460449eb50..0a5d26b22d6a 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -3,6 +3,7 @@
#define _LINUX_NTP_INTERNAL_H

extern void ntp_init(void);
+extern bool ntp_synced(void);
extern void ntp_clear(unsigned int tkid);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(unsigned int tkid);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f20bc76f43ca..37d30283ad60 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -27,6 +27,10 @@
#include "tick-internal.h"
#include "timekeeping_internal.h"
#include "ntp_internal.h"
+#include <linux/vmclock_host.h>
+
+void (*vmclock_host_update_fn)(struct timekeeper *tk);
+EXPORT_SYMBOL_GPL(vmclock_host_update_fn);

#define TK_CLEAR_NTP (1 << 0)
#define TK_CLOCK_WAS_SET (1 << 1)
@@ -2390,6 +2394,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
}
}

+ vmclock_host_update(tk);
+
/*
* If the clock is behind the NTP time, increase the multiplier by 1
* to catch up with it. If it's ahead and there was a remainder in the
diff --git a/kernel/time/vmclock_host.c b/kernel/time/vmclock_host.c
new file mode 100644
index 000000000000..f4baf9069e70
--- /dev/null
+++ b/kernel/time/vmclock_host.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /dev/vmclock_host - Expose host NTP-disciplined time as a vmclock page.
+ *
+ * This provides a vmclock_abi structure populated from the host's
+ * timekeeper and expressed in TAI (the CLOCK_TAI timescale), allowing a
+ * VMM to efficiently relay precision time to guests without per-tick
+ * overhead.
+ *
+ * The page is updated only when the NTP frequency (ntp_tick) changes
+ * or the clocksource changes — not on every timekeeping tick.
+ * Userspace can poll() for changes.
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates.
+ */
+
+#include <linux/clocksource_ids.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/wait.h>
+
+#include <uapi/linux/vmclock-abi.h>
+
+/*
+ * NOTE(review): these two declarations duplicate the ones added by this
+ * patch to <linux/vmclock_host.h> and kernel/time/ntp_internal.h;
+ * including those headers instead would keep the prototypes in sync.
+ */
+extern void (*vmclock_host_update_fn)(struct timekeeper *tk);
+extern bool ntp_synced(void);
+
+/* The single page exposed through read() and mmap(); set up in init. */
+static struct vmclock_abi *vmclock_page;
+/* poll() waiters; woken at the end of each published update. */
+static DECLARE_WAIT_QUEUE_HEAD(vmclock_wait);
+/* ntp_tick and clocksource id seen at the last published update. */
+static u64 vmclock_last_ntp_tick = 1; /* Sentinel: force first update */
+static enum clocksource_ids vmclock_last_cs_id;
+
+/*
+ * Compute counter_period_frac_sec and counter_period_shift from the
+ * timekeeper's ntp_tick and cycle_interval.
+ *
+ * ntp_tick is ns_per_tick << 32.
+ * cycle_interval is counter cycles per tick.
+ *
+ * vmclock wants: period = frac_sec / 2^(64 + shift) in seconds, so
+ *
+ *   period   = ntp_tick / (cycle_interval * 10^9 * 2^32) seconds/cycle
+ *   frac_sec = ntp_tick * 2^(32+shift) / (cycle_interval * 10^9)
+ *
+ * One 64-bit division cannot produce 64 significant quotient bits, so
+ * the quotient is extended from the remainder, up to 32 bits at a time.
+ *
+ * @tk:           timekeeper supplying ntp_tick and cycle_interval
+ * @period_frac:  receives the 64-bit period mantissa
+ * @period_shift: receives the matching shift
+ *
+ * If ntp_tick is zero or the divisor evaluates to zero, both outputs are
+ * set to zero instead of performing an undefined clz or a division by
+ * zero.
+ */
+static void vmclock_compute_period(struct timekeeper *tk,
+				   u64 *period_frac, u8 *period_shift)
+{
+	u64 ntp_tick = tk->ntp_tick;
+	u64 cycle_interval = tk->cycle_interval;
+	u64 divisor = cycle_interval * 1000000000ULL;
+	u64 rem, result;
+	int headroom, bits_so_far;
+
+	/* __builtin_clzll(0) is undefined; a zero divisor would fault. */
+	if (!ntp_tick || !divisor) {
+		*period_frac = 0;
+		*period_shift = 0;
+		return;
+	}
+
+	headroom = __builtin_clzll(ntp_tick);
+
+	/*
+	 * Compute ntp_tick * 2^(headroom + N) / divisor with 64 bits
+	 * of precision, using iterative 32-bit chunk divisions.
+	 *
+	 * First division: ntp_tick << headroom / divisor
+	 */
+	result = div64_u64_rem(ntp_tick << headroom, divisor, &rem);
+	bits_so_far = 64 - __builtin_clzll(result ?: 1);
+
+	/* Fill remaining bits 32 at a time from the remainder */
+	while (bits_so_far < 64 && rem) {
+		int chunk = min(32, 64 - bits_so_far);
+		int rem_headroom = __builtin_clzll(rem);
+		u64 extra;
+
+		if (rem_headroom < chunk)
+			chunk = rem_headroom;
+
+		/*
+		 * The remainder already fills all 64 bits, so it cannot be
+		 * scaled any further; stop here instead of looping without
+		 * making progress. The tail below pads the result.
+		 */
+		if (!chunk)
+			break;
+
+		extra = div64_u64_rem(rem << chunk, divisor, &rem);
+		result = (result << chunk) | extra;
+		bits_so_far += chunk;
+		headroom += chunk;
+	}
+
+	/* Pad with zeros if we ran out of remainder */
+	if (bits_so_far < 64) {
+		result <<= (64 - bits_so_far);
+		headroom += (64 - bits_so_far);
+	}
+
+	/*
+	 * result = ntp_tick * 2^headroom / divisor
+	 *        = (ntp_tick / (cycle_interval * 10^9)) * 2^headroom
+	 *        = period_seconds * 2^32 * 2^headroom
+	 *        = period_seconds * 2^(32 + headroom)
+	 *
+	 * vmclock: frac_sec / 2^(64 + shift) = period_seconds
+	 * So: shift = 32 + headroom - 64 = headroom - 32
+	 */
+	*period_frac = result;
+	*period_shift = (u8)(headroom - 32);
+}
+
+
+/*
+ * Translate the timekeeper's clocksource id into a vmclock counter id.
+ *
+ * Only the TSC (on X86 builds) and the architected counter (on ARM64
+ * builds) have a vmclock equivalent; everything else is reported as
+ * VMCLOCK_COUNTER_INVALID.
+ */
+static u8 vmclock_counter_id(struct timekeeper *tk)
+{
+	switch (tk->cs_id) {
+	case CSID_X86_TSC:
+		if (IS_ENABLED(CONFIG_X86))
+			return VMCLOCK_COUNTER_X86_TSC;
+		break;
+	case CSID_ARM_ARCH_COUNTER:
+		if (IS_ENABLED(CONFIG_ARM64))
+			return VMCLOCK_COUNTER_ARM_VCNT;
+		break;
+	default:
+		break;
+	}
+
+	return VMCLOCK_COUNTER_INVALID;
+}
+
+/*
+ * Publish a fresh snapshot of @tk into the shared vmclock page.
+ *
+ * Installed as vmclock_host_update_fn, so it runs each time
+ * vmclock_host_update() is called; it returns early unless tk->ntp_tick
+ * or tk->cs_id differs from the values recorded at the previous update.
+ *
+ * The update is bracketed by two seq_count increments (odd while in
+ * progress, even once complete) and finishes by waking poll() waiters.
+ */
+static void vmclock_host_do_update(struct timekeeper *tk)
+{
+	struct vmclock_abi *clk = vmclock_page;
+	u8 counter_id;
+
+	if (!clk)
+		return;
+
+	/* Only do a full update when something meaningful changes */
+	if (tk->ntp_tick == vmclock_last_ntp_tick &&
+	    tk->cs_id == vmclock_last_cs_id)
+		return;
+
+	vmclock_last_ntp_tick = tk->ntp_tick;
+	vmclock_last_cs_id = tk->cs_id;
+
+	counter_id = vmclock_counter_id(tk);
+
+	/* Increment seq_count to odd (update in progress) */
+	WRITE_ONCE(clk->seq_count, cpu_to_le32(le32_to_cpu(clk->seq_count) + 1));
+	smp_wmb();
+
+	clk->counter_id = counter_id;
+
+	if (counter_id != VMCLOCK_COUNTER_INVALID) {
+		s64 ns = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+		u64 period_frac, hi, rem;
+		u8 period_shift;
+		s64 carry_sec;
+		s32 frac_ns;
+
+		/*
+		 * Adjust for ntp_error: represent where the clock is
+		 * converging TO, not where it is right now.
+		 */
+		ns += tk->ntp_error >> (tk->tkr_mono.shift + tk->ntp_error_shift);
+
+		/*
+		 * The correction is signed, so the sum may be negative or
+		 * reach a full second. Fold it into [0, 10^9) and carry the
+		 * difference into the seconds field; the 32.32 conversion
+		 * below is only valid for a sub-second value.
+		 */
+		carry_sec = div_s64_rem(ns, 1000000000, &frac_ns);
+		if (frac_ns < 0) {
+			frac_ns += 1000000000;
+			carry_sec--;
+		}
+
+		clk->counter_value = cpu_to_le64(tk->tkr_mono.cycle_last);
+		clk->time_sec = cpu_to_le64(tk->xtime_sec + tk->tai_offset +
+					    carry_sec);
+
+		/* ns / 10^9 as a 0.64 fraction, built from two 32-bit halves */
+		hi = div64_u64_rem((u64)frac_ns << 32, 1000000000ULL, &rem);
+		clk->time_frac_sec = cpu_to_le64(
+			(hi << 32) | div64_u64(rem << 32, 1000000000ULL));
+
+		vmclock_compute_period(tk, &period_frac, &period_shift);
+		clk->counter_period_frac_sec = cpu_to_le64(period_frac);
+		clk->counter_period_shift = period_shift;
+
+		clk->clock_status = ntp_synced() ?
+			VMCLOCK_STATUS_SYNCHRONIZED :
+			VMCLOCK_STATUS_FREERUNNING;
+	} else {
+		clk->clock_status = VMCLOCK_STATUS_UNKNOWN;
+	}
+
+	clk->tai_offset_sec = cpu_to_le16((s16)tk->tai_offset);
+	clk->flags = cpu_to_le64(VMCLOCK_FLAG_TAI_OFFSET_VALID |
+				 VMCLOCK_FLAG_TIME_MONOTONIC |
+				 VMCLOCK_FLAG_NOTIFICATION_PRESENT);
+
+	/* Back to even: the snapshot is complete and consistent */
+	smp_wmb();
+	WRITE_ONCE(clk->seq_count, cpu_to_le32(le32_to_cpu(clk->seq_count) + 1));
+
+	/*
+	 * NOTE(review): wake_up_interruptible() takes the waitqueue lock.
+	 * If this hook runs with the timekeeper's raw lock held, that is
+	 * presumably not PREEMPT_RT-safe and the wakeup should be deferred
+	 * (e.g. via irq_work) — TODO confirm the calling context.
+	 */
+	wake_up_interruptible(&vmclock_wait);
+}
+
+/* File operations */
+
+/*
+ * Per-open state, allocated in open() and kept in file->private_data.
+ * last_seq holds the seq_count observed by the most recent read(); poll()
+ * reports the file readable while the page's seq_count differs from it.
+ */
+struct vmclock_host_file {
+ u32 last_seq;
+};
+
+/*
+ * open(): attach zero-initialised per-open state to the file.
+ *
+ * Returns 0 on success or -ENOMEM if the state cannot be allocated.
+ */
+static int vmclock_host_open(struct inode *inode, struct file *fp)
+{
+	struct vmclock_host_file *state = kzalloc(sizeof(*state), GFP_KERNEL);
+
+	if (state) {
+		fp->private_data = state;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/* release(): free the per-open state attached by open(). Always returns 0. */
+static int vmclock_host_release(struct inode *inode, struct file *fp)
+{
+	struct vmclock_host_file *state = fp->private_data;
+
+	kfree(state);
+
+	return 0;
+}
+
+/*
+ * mmap(): map the single vmclock page read-only into the caller.
+ *
+ * Returns -EROFS unless the mapping is readable and not writable,
+ * -EINVAL unless it covers exactly one page at offset 0, otherwise the
+ * result of remap_pfn_range().
+ */
+static int vmclock_host_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & (VM_READ | VM_WRITE)) != VM_READ)
+		return -EROFS;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
+		return -EINVAL;
+
+	/*
+	 * The page is shared by every user of the device. Drop VM_MAYWRITE
+	 * so a mapping created read-only cannot be made writable later with
+	 * mprotect().
+	 */
+	vm_flags_clear(vma, VM_MAYWRITE);
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       virt_to_phys(vmclock_page) >> PAGE_SHIFT,
+			       PAGE_SIZE, vma->vm_page_prot);
+}
+
+/*
+ * read(): copy a consistent snapshot of the vmclock page to userspace.
+ *
+ * @buf/@count: destination and requested length; the length is clamped
+ *              to what remains of the page after *@ppos
+ * @ppos:       byte offset into the page, advanced on success
+ *
+ * Returns the number of bytes copied, 0 at or beyond the end of the
+ * page, or -EFAULT if the user buffer cannot be written.
+ *
+ * The copy is retried until seq_count is even and unchanged across the
+ * copy. On success the observed seq_count is remembered for poll().
+ */
+static ssize_t vmclock_host_read(struct file *fp, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct vmclock_host_file *fst = fp->private_data;
+	u32 seq;
+
+	if (*ppos >= PAGE_SIZE)
+		return 0;
+	if (count > PAGE_SIZE - *ppos)
+		count = PAGE_SIZE - *ppos;
+
+	/*
+	 * An explicit loop is required here: inside do { } while (cond),
+	 * "continue" jumps to the condition, which is false when an odd
+	 * seq_count is simply unchanged, so the loop would exit without
+	 * having copied anything.
+	 */
+	for (;;) {
+		seq = le32_to_cpu(READ_ONCE(vmclock_page->seq_count));
+		if (seq & 1) {
+			/* Update in progress: wait for the writer to finish */
+			cpu_relax();
+			continue;
+		}
+		smp_rmb();
+		if (copy_to_user(buf, (char *)vmclock_page + *ppos, count))
+			return -EFAULT;
+		smp_rmb();
+		if (le32_to_cpu(READ_ONCE(vmclock_page->seq_count)) == seq)
+			break;
+	}
+
+	fst->last_seq = seq;
+	*ppos += count;
+	return count;
+}
+
+/*
+ * poll(): report the file readable when the page's seq_count differs
+ * from the value recorded by this file's last read().
+ */
+static __poll_t vmclock_host_poll(struct file *fp, poll_table *wait)
+{
+	struct vmclock_host_file *state = fp->private_data;
+
+	poll_wait(fp, &vmclock_wait, wait);
+
+	if (le32_to_cpu(READ_ONCE(vmclock_page->seq_count)) == state->last_seq)
+		return 0;
+
+	return EPOLLIN | EPOLLRDNORM;
+}
+
+/*
+ * File operations for /dev/vmclock_host: per-open state via open/release,
+ * a read-only single-page mmap, a seqcount-consistent read, and poll for
+ * change notification.
+ */
+static const struct file_operations vmclock_host_fops = {
+ .owner = THIS_MODULE,
+ .open = vmclock_host_open,
+ .release = vmclock_host_release,
+ .mmap = vmclock_host_mmap,
+ .read = vmclock_host_read,
+ .poll = vmclock_host_poll,
+};
+
+/* Misc device named "vmclock_host", registered with a dynamic minor. */
+static struct miscdevice vmclock_host_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "vmclock_host",
+ .fops = &vmclock_host_fops,
+};
+
+/*
+ * Module init: allocate and pre-fill the shared page, register the misc
+ * device, then install the timekeeping hook.
+ *
+ * Returns 0 on success, -ENOMEM if the page cannot be allocated, or the
+ * error returned by misc_register().
+ */
+static int __init vmclock_host_init(void)
+{
+	int ret;
+
+	vmclock_page = (struct vmclock_abi *)get_zeroed_page(GFP_KERNEL);
+	if (!vmclock_page)
+		return -ENOMEM;
+
+	/* Set constant fields */
+	vmclock_page->magic = cpu_to_le32(VMCLOCK_MAGIC);
+	vmclock_page->size = cpu_to_le32(PAGE_SIZE);
+	vmclock_page->version = cpu_to_le16(1);
+	vmclock_page->time_type = VMCLOCK_TIME_TAI;
+
+	/*
+	 * The device becomes visible before the first update has run, and
+	 * a zeroed counter_id would be read as a real counter type
+	 * (VMCLOCK_COUNTER_ARM_VCNT is 0 in the vmclock ABI). Mark the page
+	 * as unusable until vmclock_host_do_update() fills it in.
+	 */
+	vmclock_page->counter_id = VMCLOCK_COUNTER_INVALID;
+	vmclock_page->clock_status = VMCLOCK_STATUS_UNKNOWN;
+
+	ret = misc_register(&vmclock_host_miscdev);
+	if (ret) {
+		free_page((unsigned long)vmclock_page);
+		vmclock_page = NULL;
+		return ret;
+	}
+
+	/* Install the hook last, once the page and device are ready */
+	WRITE_ONCE(vmclock_host_update_fn, vmclock_host_do_update);
+	pr_info("vmclock_host: registered /dev/vmclock_host\n");
+	return 0;
+}
+
+/*
+ * Module exit: remove the timekeeping hook, unregister the device and
+ * free the shared page, in that order.
+ */
+static void __exit vmclock_host_exit(void)
+{
+ /* Unhook first so that no new update starts against the page. */
+ WRITE_ONCE(vmclock_host_update_fn, NULL);
+ /*
+ * NOTE(review): presumably this waits for a hook call already in
+ * flight; that only holds if the caller runs in a context RCU treats
+ * as a read-side critical section — TODO confirm.
+ */
+ synchronize_rcu();
+ misc_deregister(&vmclock_host_miscdev);
+ free_page((unsigned long)vmclock_page);
+ vmclock_page = NULL;
+}
+
+module_init(vmclock_host_init);
+module_exit(vmclock_host_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Woodhouse <dwmw@xxxxxxxxxxxx>");
+MODULE_DESCRIPTION("VMClock host time provider");
diff --git a/tools/testing/selftests/timers/vmclock_host_test.c b/tools/testing/selftests/timers/vmclock_host_test.c
new file mode 100644
index 000000000000..c83cc7e6d404
--- /dev/null
+++ b/tools/testing/selftests/timers/vmclock_host_test.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test /dev/vmclock_host by comparing its time against CLOCK_TAI.
+ *
+ * Maps the vmclock page, reads time from it using the ABI formula,
+ * and compares with clock_gettime(CLOCK_TAI) using ABA timestamps
+ * to bound the uncertainty.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/vmclock-abi.h>
+
+/*
+ * read_counter() - sample the raw hardware counter from userspace.
+ *
+ * x86-64 executes RDTSC and combines EDX:EAX into one 64-bit value;
+ * arm64 reads CNTVCT_EL0. Any other architecture fails the build.
+ */
+#ifdef __x86_64__
+static inline uint64_t read_counter(void)
+{
+ unsigned int lo, hi;
+ asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+}
+#elif defined(__aarch64__)
+static inline uint64_t read_counter(void)
+{
+ uint64_t val;
+ asm volatile("mrs %0, cntvct_el0" : "=r"(val));
+ return val;
+}
+#else
+#error "Unsupported architecture"
+#endif
+
+/*
+ * Evaluate the vmclock formula for a given counter reading:
+ *
+ *   T = time_sec + time_frac_sec / 2^64 +
+ *       ((counter_now - counter_value) * counter_period_frac_sec)
+ *               / 2^(64 + counter_period_shift)
+ *
+ * The intermediate value is kept as 64.64 fixed-point seconds in a
+ * 128-bit integer. Returns the result in nanoseconds.
+ */
+static int64_t vmclock_read_ns(const volatile struct vmclock_abi *clk,
+			       uint64_t counter_now)
+{
+	const uint64_t frac_mask = 0xFFFFFFFFFFFFFFFFULL;
+	uint64_t elapsed = counter_now - clk->counter_value;
+	uint64_t period = clk->counter_period_frac_sec;
+	uint8_t shift = clk->counter_period_shift;
+	__uint128_t fixed;
+	uint64_t whole_sec;
+	uint64_t nsec;
+
+	/* Elapsed time since the reference point, in units of 2^-64 s */
+	fixed = ((__uint128_t)elapsed * period) >> shift;
+
+	/* Add the sub-second part of the reference time */
+	fixed += clk->time_frac_sec;
+
+	/* Upper half: whole seconds. Lower half: fraction, scaled to ns. */
+	whole_sec = (uint64_t)(fixed >> 64);
+	nsec = (uint64_t)(((fixed & frac_mask) * 1000000000ULL) >> 64);
+
+	return (int64_t)(clk->time_sec + whole_sec) * 1000000000LL + nsec;
+}
+
+/* Read CLOCK_TAI and return it as a single nanosecond count. */
+static int64_t clock_tai_ns(void)
+{
+	struct timespec now;
+	int64_t total;
+
+	clock_gettime(CLOCK_TAI, &now);
+
+	total = (int64_t)now.tv_sec * 1000000000LL;
+	total += now.tv_nsec;
+
+	return total;
+}
+
+int main(void)
+{
+ int fd, ret = 0;
+ volatile struct vmclock_abi *clk;
+ int i, failures = 0;
+
+ fd = open("/dev/vmclock_host", O_RDONLY);
+ if (fd < 0) {
+ if (errno == ENOENT) {
+ printf("SKIP: /dev/vmclock_host not available\n");
+ return 4;
+ }
+ perror("open /dev/vmclock_host");
+ return 1;
+ }
+
+ clk = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
+ if (clk == MAP_FAILED) {
+ perror("mmap");
+ close(fd);
+ return 1;
+ }
+
+ if (clk->magic != VMCLOCK_MAGIC) {
+ fprintf(stderr, "Bad magic: 0x%x\n", clk->magic);
+ ret = 1;
+ goto out;
+ }
+
+ if (clk->counter_id == VMCLOCK_COUNTER_INVALID) {
+ printf("SKIP: counter_id is INVALID (clocksource not TSC?)\n");
+ ret = 4;
+ goto out;
+ }
+
+ printf("vmclock_host: version=%u counter_id=%u time_type=%u status=%u\n",
+ clk->version, clk->counter_id, clk->time_type, clk->clock_status);
+ printf(" tai_offset=%d\n", (int16_t)clk->tai_offset_sec);
+ printf(" counter_period_frac_sec=0x%" PRIx64 " shift=%u\n",
+ (uint64_t)clk->counter_period_frac_sec, clk->counter_period_shift);
+
+ /* ABA comparison: read CLOCK_TAI, vmclock, CLOCK_TAI */
+ printf("\nABA comparison (vmclock vs CLOCK_TAI):\n");
+ for (i = 0; i < 10; i++) {
+ uint32_t seq;
+ int64_t tai_before, tai_after, vmclock_ns;
+ int64_t delta, window;
+
+ /* Read with seqcount retry */
+ do {
+ seq = clk->seq_count;
+ if (seq & 1) {
+ __asm__ volatile("pause" ::: "memory");
+ continue;
+ }
+ __asm__ volatile("" ::: "memory");
+
+ tai_before = clock_tai_ns();
+ uint64_t ctr = read_counter();
+ tai_after = clock_tai_ns();
+
+ __asm__ volatile("" ::: "memory");
+ if (clk->seq_count != seq)
+ continue;
+
+ vmclock_ns = vmclock_read_ns(clk, ctr);
+ break;
+ } while (1);
+
+ window = tai_after - tai_before;
+ /* vmclock should be between tai_before and tai_after */
+ delta = vmclock_ns - tai_before;
+
+ printf(" [%d] vmclock-tai_before=%+" PRId64 "ns window=%"
+ PRId64 "ns", i, delta, window);
+
+ if (delta < -2000 || delta > window + 2000) {
+ printf(" FAIL (out of range)\n");
+ failures++;
+ } else {
+ printf(" OK\n");
+ }
+
+ usleep(100000); /* 100ms between samples */
+ }
+
+ if (failures) {
+ printf("\nFAIL: %d/%d samples out of range\n", failures, 10);
+ ret = 1;
+ } else {
+ printf("\nPASS: all samples within ABA window\n");
+ }
+
+out:
+ munmap((void *)clk, 4096);
+ close(fd);
+ return ret;
+}
--
2.51.0