Skip to main content

Scaling Hybrid Quantum Neural Networks across GPUs with Covalent Cloud and CUDA Quantum

In this tutorial, we’ll use Covalent Cloud to access high-power GPUs to simulate quantum circuits with CUDA Quantum. Specifically, we’ll create a hybrid neural network consisting of quantum and classical layers, then train and evaluate using CUDA Quantum and PyTorch.

Covalent Environment

To set up our local environment, pip-install the following packages:

covalent-cloud>=0.81.0
matplotlib==3.9.2
numpy==1.23.5
pillow==11.0.0
torch==2.4.1
torchvision==0.19.1

We also create a Covalent Cloud environment that mirrors some of these local dependencies.

import covalent as ct
import covalent_cloud as cc

# Register the Covalent Cloud API key (replace the placeholder with a real key).
cc.save_api_key("YOUR-API-KEY")

# Packages for the remote "cuda-quantum" environment.
cloud_pip_packages = [
    "cuda-quantum==0.8.0",  # install in cloud env only
    "matplotlib==3.9.1",
    "torch==2.4.1",
    "torchvision==0.19.1",
]
cc.create_env(name="cuda-quantum", pip=cloud_pip_packages, wait=True)

Next, we define two Covalent Cloud executors: a CPU-only executor and a GPU-equipped executor. We’ll use the latter for compute-heavy tasks, such as training the neural network. The CPU executor will handle lighter tasks, like downloading data and plotting results.

# Settings common to both executors.
_shared_resources = {
    "env": "cuda-quantum",
    "num_cpus": 4,
    "memory": "16GB",
    "time_limit": 60 * 60,  # 1 hour
}

# CPU-only executor for light tasks.
cpu_executor = cc.CloudExecutor(**_shared_resources)

# Same resources plus one H100 GPU for the compute-heavy tasks.
gpu_executor = cc.CloudExecutor(num_gpus=1, gpu_type="h100", **_shared_resources)

Let’s also create a Covalent Cloud volume named “cudaq” for persistent storage. We will include this volume during dispatch to make it accessible to every task.

# Name of the persistent Covalent Cloud volume used by every task.
VOLUME_NAME = "cudaq"
volume = cc.volume(VOLUME_NAME)

Hybrid Quantum Neural Network

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from torch.autograd import Function
from torchvision import datasets, transforms

Our hybrid neural network will be a simple one, for the sake of example. It consists of a convolutional neural network and an appended quantum layer. The code below is based on this CUDA Quantum tutorial example.

class QuantumFunction(Function):
    """Autograd bridge between PyTorch and a CUDA Quantum kernel.

    An instance stores the parameterised kernel and the torch device on which
    expectation values are created. The static ``forward``/``backward`` pair
    is what ``QuantumFunction.apply`` dispatches to.
    """

    def __init__(self, qubit_count: int, device, kernel, thetas):
        """Define the quantum circuit in CUDA Quantum.

        Args:
            qubit_count: Number of qubits allocated on ``kernel``.
            device: Torch device used for the tensors returned by ``run``.
            kernel: Kernel builder the gates are appended to.
            thetas: Kernel parameter list; entries 0 and 1 drive the two
                rotations applied to qubit 0.
        """
        self.kernel = kernel
        self.theta = thetas
        self.device = device
        qubits = kernel.qalloc(qubit_count)
        self.kernel.h(qubits)
        # Variational gate parameters which are optimised during training.
        kernel.ry(thetas[0], qubits[0])
        kernel.rx(thetas[1], qubits[0])

    def run(self, thetas: torch.tensor) -> torch.tensor:
        """Execute the quantum circuit to output an expectation value.

        Args:
            thetas: Tensor of rotation angles passed to the kernel as a list.

        Returns:
            Tensor on ``self.device`` holding the expectation value of Z on
            qubit 0.
        """
        import cudaq  # importing here avoids local package requirement

        exp_ = cudaq.observe(self.kernel, cudaq.spin.z(0), thetas.tolist()).expectation()
        return torch.tensor(exp_, device=self.device)

    @staticmethod
    def forward(ctx, thetas: torch.tensor, quantum_circuit, shift) -> torch.tensor:
        """Evaluate the circuit and stash what ``backward`` needs on ``ctx``.

        Args:
            ctx: Autograd context object.
            thetas: Angles fed to the circuit.
            quantum_circuit: Object exposing ``run(thetas)`` and ``device``.
            shift: Step applied to each angle when estimating gradients.

        Returns:
            The expectation value returned by ``quantum_circuit.run``.
        """
        # Save shift and quantum_circuit in context to use in backward.
        ctx.shift = shift
        ctx.quantum_circuit = quantum_circuit

        # Calculate expectation value.
        expectation = ctx.quantum_circuit.run(thetas)

        ctx.save_for_backward(thetas, expectation)
        return expectation

    @staticmethod
    def backward(ctx, grad_output):
        """Backward pass computed with a central finite difference.

        Each angle is shifted by ``+ctx.shift`` and ``-ctx.shift`` and the
        circuit is re-evaluated for both.

        Returns:
            Gradient with respect to ``thetas``, followed by ``None`` for the
            two non-tensor inputs of ``forward``.
        """
        thetas, _ = ctx.saved_tensors

        device = ctx.quantum_circuit.device

        gradients = torch.zeros(len(thetas), device=device)
        for i in range(len(thetas)):
            shift_right = torch.clone(thetas)
            shift_right[i] += ctx.shift
            shift_left = torch.clone(thetas)
            shift_left[i] -= ctx.shift

            expectation_right = ctx.quantum_circuit.run(shift_right)
            expectation_left = ctx.quantum_circuit.run(shift_left)
            # Divide by the full step width (2 * shift). Without the
            # parentheses the difference would be multiplied by the shift.
            gradients[i] = (expectation_right - expectation_left) / (2 * ctx.shift)
        return gradients * grad_output.float(), None, None


class QuantumLayer(nn.Module):
    """Torch module that wraps a QuantumFunction so it can be used as a layer."""

    def __init__(
        self, qubit_count: int, shift: torch.tensor, device, kernel, thetas
    ):
        super().__init__()

        self.shift = shift
        self.device = device
        # The circuit is built once here and reused by every forward pass.
        self.quantum_circuit = QuantumFunction(qubit_count, device, kernel, thetas)

    def forward(self, input):
        """Evaluate the stored circuit for ``input`` via the autograd function."""
        return QuantumFunction.apply(input, self.quantum_circuit, self.shift)


class Net(nn.Module):
    """Small convolutional network followed by a single quantum layer."""

    def __init__(self, device, kernel, thetas, qubit_count=1, shift=torch.tensor(np.pi / 2)):
        super().__init__()

        # Classical layers.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.dropout = nn.Dropout2d()
        self.fc1 = nn.Linear(256, 64)
        # Two outputs: one per variational parameter of the quantum circuit.
        self.fc2 = nn.Linear(64, 2)
        # Quantum layer; `shift` is the step used when estimating gradients.
        self.hybrid = QuantumLayer(qubit_count, shift, device, kernel, thetas)

    def forward(self, x):
        """Run one sample through the CNN and the quantum layer.

        The input is flattened with ``view(1, -1)``, so a batch of one is
        assumed. Returns a ``(1, 2)`` tensor ``[[e, 1 - e]]`` where ``e`` is
        the quantum layer output.
        """
        features = F.max_pool2d(F.relu(self.conv1(x)), 2)
        features = F.max_pool2d(F.relu(self.conv2(features)), 2)
        flat = self.dropout(features).view(1, -1)
        hidden = F.relu(self.fc1(flat))
        # Reshapes required to satisfy input dimensions to CUDA Quantum.
        angles = self.fc2(hidden).reshape(-1)
        expectation = self.hybrid(angles).reshape(-1)
        return torch.cat((expectation, 1 - expectation), -1).unsqueeze(0)

Running with Covalent Cloud

It's time to write our first electron. This electron will be a function that downloads the training data (MNIST) to our cloud volume. Note below that the electron uses cpu_executor — this task will run in Covalent Cloud, but we don’t need any GPUs for it.

@ct.electron(executor=cpu_executor)
def get_data(train_sample_count, test_sample_count):
    """Download MNIST to the volume and build loaders for digits 0 and 1.

    Args:
        train_sample_count: Maximum number of training images kept per digit.
        test_sample_count: Maximum number of test images kept per digit.

    Returns:
        Tuple ``(train_loader, test_loader)`` of shuffled, batch-size-1
        ``DataLoader`` objects.
    """
    data_path = volume / "data"

    def _binary_loader(is_train, per_class_count):
        # Fetch one MNIST split and keep only labels 0 and 1.
        mnist = datasets.MNIST(
            root=data_path,
            train=is_train,
            download=True,
            transform=transforms.Compose([transforms.ToTensor()]),
        )
        zeros = np.where(mnist.targets == 0)[0][:per_class_count]
        ones = np.where(mnist.targets == 1)[0][:per_class_count]
        keep = np.append(zeros, ones)
        mnist.data = mnist.data[keep]
        mnist.targets = mnist.targets[keep]
        return torch.utils.data.DataLoader(mnist, batch_size=1, shuffle=True)

    train_loader = _binary_loader(True, train_sample_count)
    test_loader = _binary_loader(False, test_sample_count)

    return train_loader, test_loader

The next electron will be a function that trains our hybrid neural network. It uses PyTorch and therefore benefits from GPU acceleration, if available, so we make sure the gpu_executor is specified for this electron.

@ct.electron(executor=gpu_executor)
def train_model(train_loader, learning_rate, epochs):
    """Train the hybrid network and save its weights on the cloud volume.

    Args:
        train_loader: Iterable of ``(data, target)`` batches.
        learning_rate: Learning rate passed to the Adam optimizer.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(model_path, epoch_loss)``: the path of the saved state dict
        and a list with the mean batch loss of every epoch.
    """
    import cudaq  # importing here avoids local package requirement

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    kernel, thetas = cudaq.make_kernel(list)
    model = Net(device, kernel, thetas).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.NLLLoss().to(device)

    epoch_loss = []
    model.train()

    for epoch in range(epochs):
        batch_loss = 0.0
        batch_count = 0
        for data, target in train_loader:  # Batch training.
            optimizer.zero_grad()
            data, target = data.to(device), target.to(device)

            # Forward pass.
            output = model(data).to(device)
            # Calculating loss.
            loss = loss_func(output, target)

            # Backward pass.
            loss.backward()

            # Optimize the weights.
            optimizer.step()
            batch_loss += loss.item()
            batch_count += 1

        # Average over the number of batches seen. The last enumerate index
        # is one less than that count and is zero for a single batch.
        epoch_loss.append(batch_loss / max(batch_count, 1))

        print("Training [{:.0f}%]\tLoss: {:.4f}".format(100.0 * (epoch + 1) / epochs, epoch_loss[-1]))

    # Save the model under a name unique to this hyperparameter combination,
    # so trials that share the volume cannot overwrite each other's weights.
    model_path = volume / f"model_lr{learning_rate}_epochs{epochs}.pth"
    torch.save(model.state_dict(), model_path)

    return model_path, epoch_loss

Let's also have a simple “plotting” electron to visualize how training progressed.

@ct.electron(executor=cpu_executor)
def plot_loss(epoch_loss):
    """Plot the per-epoch loss and return the rendered image.

    Args:
        epoch_loss: Sequence of loss values, one per epoch.

    Returns:
        The PIL image opened from the saved PNG file.
    """
    plot_path = '/tmp/plot.png'
    # Use a dedicated figure and close it afterwards, so repeated calls in
    # the same process neither overlay curves nor leak open figures.
    fig, ax = plt.subplots()
    try:
        ax.plot(epoch_loss)
        ax.set_title("Hybrid NN Training Convergence")
        ax.set_xlabel("Training Iterations")
        ax.set_ylabel("Neg Log Likelihood Loss")
        fig.savefig(plot_path)
    finally:
        plt.close(fig)
    return Image.open(plot_path)

Finally, the last electron will do model evaluations on unseen data to validate the model performance after training. We use the gpu_executor here as well, to access GPUs and speed up evaluation.

@ct.electron(executor=gpu_executor)
def evaluate(test_loader, model_path):
    """Load the trained weights and measure accuracy on the test data.

    Args:
        test_loader: Iterable of ``(data, target)`` batches.
        model_path: Path of the state dict written by ``train_model``.

    Returns:
        Accuracy in percent, or ``0.0`` when ``test_loader`` yields nothing.
    """
    import cudaq  # importing here avoids local package requirement

    # Fall back to the CPU when no GPU is visible, as train_model does.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    kernel, thetas = cudaq.make_kernel(list)
    # load model
    model = Net(device, kernel, thetas).to(device)
    # map_location lets weights saved on a GPU load on whichever device is in use.
    model.load_state_dict(torch.load(model_path, map_location=device))

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data).to(device)

            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.numel()

    # Guard against an empty loader instead of dividing by zero.
    accuracy = correct / total * 100 if total else 0.0
    print("Performance on test data:\n\tAccuracy: {:.1f}%".format(accuracy))
    return accuracy

Workflow

To collect the above tasks (i.e. the electrons) into a workflow, we define a “main” function (called hybrid_cudaq_workflow here) and decorate it with @ct.lattice.

@ct.lattice(workflow_executor=cpu_executor, executor=cpu_executor)
def hybrid_cudaq_workflow(
    train_sample_count=140,
    test_sample_count=70,
    learning_rates=[0.001],  # hyperparameter list to loop over
    epochs=[20]  # here as well
):
    """Train, evaluate and plot one trial per (learning rate, epochs) pair.

    Returns:
        Tuple ``(plots, accuracies)``; both are lists of
        ``(learning_rate, epochs, value)`` tuples in trial order.
    """
    train_loader, test_loader = get_data(train_sample_count, test_sample_count)

    # Learning rate is the outer loop and epoch count the inner loop.
    trials = [(rate, count) for rate in learning_rates for count in epochs]

    accuracies = []
    plots = []
    for learning_rate, num_epochs in trials:
        model_path, losses = train_model(train_loader, learning_rate, num_epochs)
        accuracy = evaluate(test_loader, model_path)
        loss_plot = plot_loss(losses)
        plots.append((learning_rate, num_epochs, loss_plot))
        accuracies.append((learning_rate, num_epochs, accuracy))

    return plots, accuracies

The workflow may then be executed by dispatching to Covalent Cloud and later fetching the result.

# Sweep (2 learning rates) x (2 epoch numbers) = 4 trials.
dispatch = cc.dispatch(hybrid_cudaq_workflow, volume=volume)
dispatch_id = dispatch(
    train_sample_count=100,
    learning_rates=[0.001, 0.05],
    epochs=[5, 20],
)

# Fetching can happen at any later time, provided the dispatch_id is known.
workflow_result = cc.get_result(dispatch_id, wait=True)
plots, accuracies = workflow_result.result.load()

This workflow produces the following graph in the Covalent Cloud UI.

cudaq workflow transport graph

To inspect results, we can show returned loss plots in a Jupyter notebook by running e.g.

# Each entry of `plots` is (learning_rate, epoch, plot); this picks the plot of the second trial.
plots[1][-1]  # -> PIL.PngImagePlugin.PngImageFile object

Training loss per epoch for the hybrid quantum neural network

Conclusion

In this tutorial, we combined Covalent Cloud, CUDA Quantum, and PyTorch to train and evaluate a hybrid quantum neural network. We also discussed choosing between CPU and GPU executors, and how to store data with Covalent Cloud volumes. If you execute the provided workflow using gpu_type='l40' (instead of the 'h100' requested in the code above), the total cost will be ~$0.60.

The full code can be found below.

Full Code
# requirements:
# covalent-cloud>=0.81.0
# matplotlib==3.9.2
# numpy==1.23.5
# pillow==11.0.0
# torch==2.4.1
# torchvision==0.19.1

import covalent as ct
import covalent_cloud as cc

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from torch.autograd import Function
from torchvision import datasets, transforms


cc.save_api_key("YOUR-API-KEY")  # NOTE: Replace with your real API key

# Packages for the remote "cuda-quantum" environment.
cloud_pip_packages = [
    "cuda-quantum==0.8.0",
    "matplotlib==3.9.1",
    "torch==2.4.1",
    "torchvision==0.19.1",
]
cc.create_env(name="cuda-quantum", pip=cloud_pip_packages, wait=True)

# Persistent volume shared by all tasks of the workflow.
volume = cc.volume("cudaq")

# Settings common to both executors.
_shared_resources = {
    "env": "cuda-quantum",
    "num_cpus": 4,
    "memory": "16GB",
    "time_limit": 60 * 60,  # 1 hour
}

# CPU-only executor for light tasks.
cpu_executor = cc.CloudExecutor(**_shared_resources)

# Same resources plus one H100 GPU for the compute-heavy tasks.
gpu_executor = cc.CloudExecutor(num_gpus=1, gpu_type="h100", **_shared_resources)


class QuantumFunction(Function):
    """Autograd bridge between PyTorch and a CUDA Quantum kernel.

    An instance stores the parameterised kernel and the torch device on which
    expectation values are created. The static ``forward``/``backward`` pair
    is what ``QuantumFunction.apply`` dispatches to.
    """

    def __init__(self, qubit_count: int, device, kernel, thetas):
        """Define the quantum circuit in CUDA Quantum.

        Args:
            qubit_count: Number of qubits allocated on ``kernel``.
            device: Torch device used for the tensors returned by ``run``.
            kernel: Kernel builder the gates are appended to.
            thetas: Kernel parameter list; entries 0 and 1 drive the two
                rotations applied to qubit 0.
        """
        self.kernel = kernel
        self.theta = thetas
        self.device = device
        qubits = kernel.qalloc(qubit_count)
        self.kernel.h(qubits)
        # Variational gate parameters which are optimised during training.
        kernel.ry(thetas[0], qubits[0])
        kernel.rx(thetas[1], qubits[0])

    def run(self, thetas: torch.tensor) -> torch.tensor:
        """Execute the quantum circuit to output an expectation value.

        Args:
            thetas: Tensor of rotation angles passed to the kernel as a list.

        Returns:
            Tensor on ``self.device`` holding the expectation value of Z on
            qubit 0.
        """
        import cudaq  # importing here avoids local package requirement

        exp_ = cudaq.observe(self.kernel, cudaq.spin.z(0), thetas.tolist()).expectation()
        return torch.tensor(exp_, device=self.device)

    @staticmethod
    def forward(ctx, thetas: torch.tensor, quantum_circuit, shift) -> torch.tensor:
        """Evaluate the circuit and stash what ``backward`` needs on ``ctx``.

        Args:
            ctx: Autograd context object.
            thetas: Angles fed to the circuit.
            quantum_circuit: Object exposing ``run(thetas)`` and ``device``.
            shift: Step applied to each angle when estimating gradients.

        Returns:
            The expectation value returned by ``quantum_circuit.run``.
        """
        # Save shift and quantum_circuit in context to use in backward.
        ctx.shift = shift
        ctx.quantum_circuit = quantum_circuit

        # Calculate expectation value.
        expectation = ctx.quantum_circuit.run(thetas)

        ctx.save_for_backward(thetas, expectation)
        return expectation

    @staticmethod
    def backward(ctx, grad_output):
        """Backward pass computed with a central finite difference.

        Each angle is shifted by ``+ctx.shift`` and ``-ctx.shift`` and the
        circuit is re-evaluated for both.

        Returns:
            Gradient with respect to ``thetas``, followed by ``None`` for the
            two non-tensor inputs of ``forward``.
        """
        thetas, _ = ctx.saved_tensors

        device = ctx.quantum_circuit.device

        gradients = torch.zeros(len(thetas), device=device)
        for i in range(len(thetas)):
            shift_right = torch.clone(thetas)
            shift_right[i] += ctx.shift
            shift_left = torch.clone(thetas)
            shift_left[i] -= ctx.shift

            expectation_right = ctx.quantum_circuit.run(shift_right)
            expectation_left = ctx.quantum_circuit.run(shift_left)
            # Divide by the full step width (2 * shift). Without the
            # parentheses the difference would be multiplied by the shift.
            gradients[i] = (expectation_right - expectation_left) / (2 * ctx.shift)
        return gradients * grad_output.float(), None, None


class QuantumLayer(nn.Module):
    """Torch module that wraps a QuantumFunction so it can be used as a layer."""

    def __init__(
        self, qubit_count: int, shift: torch.tensor, device, kernel, thetas
    ):
        super().__init__()

        self.shift = shift
        self.device = device
        # The circuit is built once here and reused by every forward pass.
        self.quantum_circuit = QuantumFunction(qubit_count, device, kernel, thetas)

    def forward(self, input):
        """Evaluate the stored circuit for ``input`` via the autograd function."""
        return QuantumFunction.apply(input, self.quantum_circuit, self.shift)


class Net(nn.Module):
    """Small convolutional network followed by a single quantum layer."""

    def __init__(self, device, kernel, thetas, qubit_count=1, shift=torch.tensor(np.pi / 2)):
        super().__init__()

        # Classical layers.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.dropout = nn.Dropout2d()
        self.fc1 = nn.Linear(256, 64)
        # Two outputs: one per variational parameter of the quantum circuit.
        self.fc2 = nn.Linear(64, 2)
        # Quantum layer; `shift` is the step used when estimating gradients.
        self.hybrid = QuantumLayer(qubit_count, shift, device, kernel, thetas)

    def forward(self, x):
        """Run one sample through the CNN and the quantum layer.

        The input is flattened with ``view(1, -1)``, so a batch of one is
        assumed. Returns a ``(1, 2)`` tensor ``[[e, 1 - e]]`` where ``e`` is
        the quantum layer output.
        """
        features = F.max_pool2d(F.relu(self.conv1(x)), 2)
        features = F.max_pool2d(F.relu(self.conv2(features)), 2)
        flat = self.dropout(features).view(1, -1)
        hidden = F.relu(self.fc1(flat))
        # Reshapes required to satisfy input dimensions to CUDA Quantum.
        angles = self.fc2(hidden).reshape(-1)
        expectation = self.hybrid(angles).reshape(-1)
        return torch.cat((expectation, 1 - expectation), -1).unsqueeze(0)


@ct.electron(executor=cpu_executor)
def get_data(train_sample_count, test_sample_count):
    """Download MNIST to the volume and build loaders for digits 0 and 1.

    Args:
        train_sample_count: Maximum number of training images kept per digit.
        test_sample_count: Maximum number of test images kept per digit.

    Returns:
        Tuple ``(train_loader, test_loader)`` of shuffled, batch-size-1
        ``DataLoader`` objects.
    """
    data_path = volume / "data"

    def _binary_loader(is_train, per_class_count):
        # Fetch one MNIST split and keep only labels 0 and 1.
        mnist = datasets.MNIST(
            root=data_path,
            train=is_train,
            download=True,
            transform=transforms.Compose([transforms.ToTensor()]),
        )
        zeros = np.where(mnist.targets == 0)[0][:per_class_count]
        ones = np.where(mnist.targets == 1)[0][:per_class_count]
        keep = np.append(zeros, ones)
        mnist.data = mnist.data[keep]
        mnist.targets = mnist.targets[keep]
        return torch.utils.data.DataLoader(mnist, batch_size=1, shuffle=True)

    train_loader = _binary_loader(True, train_sample_count)
    test_loader = _binary_loader(False, test_sample_count)

    return train_loader, test_loader


@ct.electron(executor=gpu_executor)
def train_model(train_loader, learning_rate, epochs):
    """Train the hybrid network and save its weights on the cloud volume.

    Args:
        train_loader: Iterable of ``(data, target)`` batches.
        learning_rate: Learning rate passed to the Adam optimizer.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(model_path, epoch_loss)``: the path of the saved state dict
        and a list with the mean batch loss of every epoch.
    """
    import cudaq  # importing here avoids local package requirement

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    kernel, thetas = cudaq.make_kernel(list)
    model = Net(device, kernel, thetas).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.NLLLoss().to(device)

    epoch_loss = []
    model.train()

    for epoch in range(epochs):
        batch_loss = 0.0
        batch_count = 0
        for data, target in train_loader:  # Batch training.
            optimizer.zero_grad()
            data, target = data.to(device), target.to(device)

            # Forward pass.
            output = model(data).to(device)
            # Calculating loss.
            loss = loss_func(output, target)

            # Backward pass.
            loss.backward()

            # Optimize the weights.
            optimizer.step()
            batch_loss += loss.item()
            batch_count += 1

        # Average over the number of batches seen. The last enumerate index
        # is one less than that count and is zero for a single batch.
        epoch_loss.append(batch_loss / max(batch_count, 1))

        print("Training [{:.0f}%]\tLoss: {:.4f}".format(100.0 * (epoch + 1) / epochs, epoch_loss[-1]))

    # Save the model under a name unique to this hyperparameter combination,
    # so trials that share the volume cannot overwrite each other's weights.
    model_path = volume / f"model_lr{learning_rate}_epochs{epochs}.pth"
    torch.save(model.state_dict(), model_path)

    return model_path, epoch_loss


@ct.electron(executor=cpu_executor)
def plot_loss(epoch_loss):
    """Plot the per-epoch loss and return the rendered image.

    Args:
        epoch_loss: Sequence of loss values, one per epoch.

    Returns:
        The PIL image opened from the saved PNG file.
    """
    plot_path = '/tmp/plot.png'
    # Use a dedicated figure and close it afterwards, so repeated calls in
    # the same process neither overlay curves nor leak open figures.
    fig, ax = plt.subplots()
    try:
        ax.plot(epoch_loss)
        ax.set_title("Hybrid NN Training Convergence")
        ax.set_xlabel("Training Iterations")
        ax.set_ylabel("Neg Log Likelihood Loss")
        fig.savefig(plot_path)
    finally:
        plt.close(fig)
    return Image.open(plot_path)


@ct.electron(executor=gpu_executor)
def evaluate(test_loader, model_path):
    """Load the trained weights and measure accuracy on the test data.

    Args:
        test_loader: Iterable of ``(data, target)`` batches.
        model_path: Path of the state dict written by ``train_model``.

    Returns:
        Accuracy in percent, or ``0.0`` when ``test_loader`` yields nothing.
    """
    import cudaq  # importing here avoids local package requirement

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    kernel, thetas = cudaq.make_kernel(list)
    # load model
    model = Net(device, kernel, thetas).to(device)
    # map_location lets weights saved on a GPU load on whichever device is in use.
    model.load_state_dict(torch.load(model_path, map_location=device))

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data).to(device)

            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.numel()

    # Guard against an empty loader instead of dividing by zero.
    accuracy = correct / total * 100 if total else 0.0
    print("Performance on test data:\n\tAccuracy: {:.1f}%".format(accuracy))
    return accuracy


@ct.lattice(workflow_executor=cpu_executor, executor=cpu_executor)
def hybrid_cudaq_workflow(
    train_sample_count=140,
    test_sample_count=70,
    learning_rates=[0.001],  # hyperparameter list to loop over
    epochs=[20]  # here as well
):
    """Train, evaluate and plot one trial per (learning rate, epochs) pair.

    Returns:
        Tuple ``(plots, accuracies)``; both are lists of
        ``(learning_rate, epochs, value)`` tuples in trial order.
    """
    train_loader, test_loader = get_data(train_sample_count, test_sample_count)

    # Learning rate is the outer loop and epoch count the inner loop.
    trials = [(rate, count) for rate in learning_rates for count in epochs]

    accuracies = []
    plots = []
    for learning_rate, num_epochs in trials:
        model_path, losses = train_model(train_loader, learning_rate, num_epochs)
        accuracy = evaluate(test_loader, model_path)
        loss_plot = plot_loss(losses)
        plots.append((learning_rate, num_epochs, loss_plot))
        accuracies.append((learning_rate, num_epochs, accuracy))

    return plots, accuracies


if __name__ == "__main__":
    # Sweep (2 learning rates) x (2 epoch numbers) = 4 trials.
    dispatch = cc.dispatch(hybrid_cudaq_workflow, volume=volume)
    dispatch_id = dispatch(
        train_sample_count=100,
        learning_rates=[0.001, 0.05],
        epochs=[5, 20],
    )

    # Fetching can happen at any later time, provided the dispatch_id is known.
    workflow_result = cc.get_result(dispatch_id, wait=True)
    plots, accuracies = workflow_result.result.load()

    # Write the loss plot of the second trial to disk.
    second_trial_plot = plots[1][-1]
    second_trial_plot.save("./plot_1.png")