Re: [PATCH v12 14/22] gpu: nova-core: Hopper/Blackwell: add FSP falcon EMEM operations
From: Alexandre Courbot
Date: Tue Jun 02 2026 - 11:03:42 EST
On Tue Jun 2, 2026 at 12:21 PM JST, John Hubbard wrote:
> Add external memory (EMEM) read/write operations to the GPU's FSP falcon
> engine. These operations use Falcon PIO (Programmed I/O) to communicate
> with the FSP through indirect memory access.
>
> Signed-off-by: John Hubbard <jhubbard@xxxxxxxxxx>
> ---
> drivers/gpu/nova-core/falcon/fsp.rs | 130 ++++++++++++++++++++++++++--
> drivers/gpu/nova-core/regs.rs | 15 ++++
> 2 files changed, 140 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/nova-core/falcon/fsp.rs b/drivers/gpu/nova-core/falcon/fsp.rs
> index d9f87262e8b1..6b057d958115 100644
> --- a/drivers/gpu/nova-core/falcon/fsp.rs
> +++ b/drivers/gpu/nova-core/falcon/fsp.rs
> @@ -6,12 +6,28 @@
> //! The FSP falcon handles secure boot and Chain of Trust operations
> //! on Hopper and Blackwell architectures, replacing SEC2's role.
>
> -use kernel::io::register::RegisterBase;
> +use kernel::{
> + io::{
> + register::{
> + RegisterBase,
> + WithBase, //
> + },
> + Io, //
> + },
> + num::Bounded,
> + prelude::*,
> + ptr::Alignment, //
> +};
>
> -use crate::falcon::{
> - FalconEngine,
> - PFalcon2Base,
> - PFalconBase, //
> +use crate::{
> + driver::Bar0,
> + falcon::{
> + Falcon,
> + FalconEngine,
> + PFalcon2Base,
> + PFalconBase, //
> + },
> + regs,
> };
>
> /// Type specifying the `Fsp` falcon engine. Cannot be instantiated.
> @@ -26,3 +42,107 @@ impl RegisterBase<PFalcon2Base> for Fsp {
> }
>
> impl FalconEngine for Fsp {}
> +
> +/// Maximum addressable EMEM size, derived from the 24-bit offset field
> +/// in `NV_PFALCON_FALCON_EMEM_CTL`.
> +const EMEM_MAX_SIZE: Alignment = Alignment::new::<{ 1 << 24 }>();
> +
> +/// I/O backend for the FSP falcon's external memory (EMEM).
> +///
> +/// `EMEM_CTL` is programmed once with a start offset and an auto-increment
> +/// mode, then each access to `EMEM_DATA` advances the offset by one 32-bit
> +/// word in hardware.
> +struct Emem<'a> {
> + bar: &'a Bar0,
> +}
> +
> +impl<'a> Emem<'a> {
> + fn new(bar: &'a Bar0) -> Self {
> + Self { bar }
> + }
> +
> + /// Programs `EMEM_CTL` with the start byte `offset` and the `ctl` mode bits.
> + ///
> + /// Returns `EINVAL` if `offset` is outside the addressable EMEM window.
> + fn program(&mut self, offset: usize, ctl: regs::NV_PFALCON_FALCON_EMEM_CTL) -> Result {
> + let offset = Bounded::<usize, { EMEM_MAX_SIZE.log2() }>::try_new(offset)
> + .map(Bounded::cast::<u32>)
> + .ok_or(EINVAL)?;
> +
> + self.bar
> + .write(WithBase::of::<Fsp>(), ctl.with_offset(offset));
> +
> + Ok(())
> + }
If we follow Eliot's suggestion to drop `offset` for now, then I guess
this method will bring no extra benefit and can simply be inlined in
`begin_write` and `begin_read`.
> +
> + /// Begins a write burst at byte `offset`, auto-incrementing on each write.
> + fn begin_write(&mut self, offset: usize) -> Result {
> + self.program(
> + offset,
> + regs::NV_PFALCON_FALCON_EMEM_CTL::zeroed().with_auto_increment_write(true),
> + )
> + }
> +
> + /// Begins a read burst at byte `offset`, auto-incrementing on each read.
> + fn begin_read(&mut self, offset: usize) -> Result {
> + self.program(
> + offset,
> + regs::NV_PFALCON_FALCON_EMEM_CTL::zeroed().with_auto_increment_read(true),
> + )
> + }
> +
> + /// Writes the next 32-bit `value`; hardware advances the offset.
> + fn write_next(&mut self, value: u32) {
> + self.bar.write(
> + WithBase::of::<Fsp>(),
> + regs::NV_PFALCON_FALCON_EMEM_DATA::zeroed().with_data(value),
> + );
> + }
> +
> + /// Reads the next 32-bit word; hardware advances the offset.
> + fn read_next(&mut self) -> u32 {
> + self.bar
> + .read(regs::NV_PFALCON_FALCON_EMEM_DATA::of::<Fsp>())
> + .data()
> + }
> +}
> +
> +impl Falcon<Fsp> {
> + /// Writes `data` to FSP external memory at byte `offset`.
> + ///
> + /// `data` is interpreted as little-endian 32-bit words. Returns `EINVAL`
> + /// if `offset` or the `data` length is not 4-byte aligned.
> + #[expect(dead_code)]
> + fn write_emem(&mut self, bar: &Bar0, offset: u32, data: &[u8]) -> Result {
> + if offset % 4 != 0 || data.len() % 4 != 0 {
> + return Err(EINVAL);
> + }
> +
> + let mut emem = Emem::new(bar);
> + emem.begin_write(offset as usize)?;
> + for chunk in data.chunks_exact(4) {
> + emem.write_next(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
> + }
> +
> + Ok(())
> + }
> +
> + /// Reads FSP external memory at byte `offset` into `data`.
> + ///
> + /// `data` is stored as little-endian 32-bit words. Returns `EINVAL` if
> + /// `offset` or the `data` length is not 4-byte aligned.
> + #[expect(dead_code)]
> + fn read_emem(&mut self, bar: &Bar0, offset: u32, data: &mut [u8]) -> Result {
> + if offset % 4 != 0 || data.len() % 4 != 0 {
> + return Err(EINVAL);
> + }
> +
> + let mut emem = Emem::new(bar);
> + emem.begin_read(offset as usize)?;
> + for chunk in data.chunks_exact_mut(4) {
> + chunk.copy_from_slice(&emem.read_next().to_le_bytes());
> + }
> +
> + Ok(())
> + }
> +}
> diff --git a/drivers/gpu/nova-core/regs.rs b/drivers/gpu/nova-core/regs.rs
> index 2cb1f02f35a4..da7a10c0346a 100644
> --- a/drivers/gpu/nova-core/regs.rs
> +++ b/drivers/gpu/nova-core/regs.rs
> @@ -475,6 +475,21 @@ pub(crate) fn vga_workspace_addr(self) -> Option<u64> {
> pub(crate) NV_PFALCON_FBIF_CTL(u32) @ PFalconBase + 0x00000624 {
> 7:7 allow_phys_no_ctx => bool;
> }
> +
> + // Falcon EMEM PIO registers (used by FSP on Hopper/Blackwell).
> + // These provide the falcon external memory communication interface.
> + pub(crate) NV_PFALCON_FALCON_EMEM_CTL(u32) @ PFalconBase + 0x00000ac0 {
OpenRM calls this register `NV_PFSP_EMEMC`. For some reason it is
defined with an absolute address, even though at least one other variant
exists for the GSP. Thus I think it makes sense to keep it relative in
Nova, with maybe a name closer to its OpenRM counterpart, e.g.
`NV_PFALCON_FALCON_EMEMC`.
> + /// EMEM byte offset (must be 4-byte aligned).
> + 23:0 offset;
In OpenRM this is actually two fields, `offs` and `blk`. The two
lowest bits are also not used; we should do the same, both for accuracy
and to make users enforce the 4-byte alignment.
> + /// Auto-increment the offset after each write.
> + 24:24 auto_increment_write => bool;
> + /// Auto-increment the offset after each read.
> + 25:25 auto_increment_read => bool;
Similarly in OpenRM these fields are `aincw` and `aincr` - let's use the
same names for discoverability.