如果这篇博客帮助到你,可以请我喝一杯咖啡~
CC BY 4.0 (除特别声明或转载文章外)
我们的计算机图形学小组 Project2 选题项目~
Taichi是一种用于计算机图形应用程序的高性能编程语言。
- 性能
- 生产率
- 稀疏计算
- 可微编程
- 元编程
开发环境
硬件
所用机器型号为 VAIO Z Flip 2016
- Intel(R) Core(TM) i7-6567U CPU @3.30GHZ 3.31GHz
- 8.00GB RAM
软件
- Windows 10, 64-bit (Build 17763) 10.0.17763
- Visual Studio Code 1.39.2
- Python 2019.10.41019:九月底发布的 VSCode Python 插件支持在编辑器窗口内原生运行 juyter nootbook 了,非常赞!
- Remote - WSL 0.39.9:配合 WSL,在 Windows 上获得 Linux 接近原生环境的体验。
- Windows Subsystem for Linux [Ubuntu 18.04.2 LTS]:WSL 是以软件的形式运行在 Windows 下的 Linux 子系统,是近些年微软推出来的新工具,可以在 Windows 系统上原生运行 Linux。
- Python 3.7.4 64-bit (‘anaconda3’:virtualenv):安装在 WSL 中。
taichi 库环境配置
出于效率的考虑,太極本身是由 C++ 构建的,但用 Python 包装了接口。
# 安装openCV
python3 -m pip install opencv-python
# CPU only. No GPU/CUDA needed. (Linux, OS X and Windows)
python3 -m pip install taichi-nightly==0.3.16
# With GPU (CUDA 10.0) support (Linux only)
# python3 -m pip install taichi-nightly-cuda-10-0
# With GPU (CUDA 10.1) support (Linux only)
# python3 -m pip install taichi-nightly-cuda-10-1
这里我本地机器所用的显卡是 Intel(R) Iris(TM) 550,本身不具备 CUDA 环境,因此安装了 CPU ONLY 版本。同时由于 taichi 每天都在更新,这里仅保证代码在我们所安装的版本(这篇文章完成于 2019-12-31,此时的最新版本)上能够跑通。
运行官方示例文件
可以执行下述代码获取官方 repo 中的一些示例文件。
svn checkout https://github.com/yuanming-hu/taichi/trunk/examples
taichi_logo.py
下面这份代码来自于https://github.com/yuanming-hu/taichi/blob/master/examples/taichi_logo.py
# TODO: this program crashes clang-7 when compiled with ti.x86_64. Why?
import taichi as ti
import cv2
import numpy as np
def Vector2(x, y):
return ti.Vector([x, y])
@ti.func
def inside(p, c, r):
return (p - c).norm_sqr() <= r * r
@ti.func
def inside_taichi(p_):
p = p_
p = Vector2(0.5, 0.5) + (p - Vector2(0.5, 0.5)) * 1.11
ret = -1
if not inside(p, Vector2(0.50, 0.50), 0.55):
if ret == -1:
ret = 0
if not inside(p, Vector2(0.50, 0.50), 0.50):
if ret == -1:
ret = 1
if inside(p, Vector2(0.50, 0.25), 0.09):
if ret == -1:
ret = 1
if inside(p, Vector2(0.50, 0.75), 0.09):
if ret == -1:
ret = 0
if inside(p, Vector2(0.50, 0.25), 0.25):
if ret == -1:
ret = 0
if inside(p, Vector2(0.50, 0.75), 0.25):
if ret == -1:
ret = 1
if p[0] < 0.5:
if ret == -1:
ret = 1
else:
if ret == -1:
ret = 0
return ret
x = ti.var(ti.f32)
n = 512
ti.cfg.use_llvm = True
@ti.layout
def layout():
ti.root.dense(ti.ij, n).place(x)
@ti.kernel
def paint():
for i in range(n * 4):
for j in range(n * 4):
ret = 1.0 - \
inside_taichi(Vector2(1.0 * i / n / 4, 1.0 * j / n / 4))
x[i // 4, j // 4] += ret / 16
paint()
img = np.empty((n, n), dtype=np.float32)
for i in range(n):
for j in range(n):
img[i, j] = x[j, n - 1 - i]
cv2.imshow('Taichi', img)
cv2.waitKey(0)
运行上述代码,结果如下:
可以看到,这里成功画出了一个太极图,说明本地配置没有问题。
mpm88.py
上面这个太极图是静态的,我们再来看一下动态画面实时渲染的效果。
下面这份代码来自于https://github.com/yuanming-hu/taichi/blob/master/examples/mpm88.py。
import taichi as ti
import random
dim = 2
n_particles = 8192
n_grid = 128
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 2.0e-4
p_vol = (dx * 0.5) ** 2
p_rho = 1
p_mass = p_vol * p_rho
E = 400
x = ti.Vector(dim, dt=ti.f32, shape=n_particles)
v = ti.Vector(dim, dt=ti.f32, shape=n_particles)
C = ti.Matrix(dim, dim, dt=ti.f32, shape=n_particles)
J = ti.var(dt=ti.f32, shape=n_particles)
grid_v = ti.Vector(dim, dt=ti.f32, shape=(n_grid, n_grid))
grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))
ti.cfg.arch = ti.cuda
@ti.kernel
def substep():
for p in x:
base = (x[p] * inv_dx - 0.5).cast(int)
fx = x[p] * inv_dx - base.cast(float)
w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1),
0.5 * ti.sqr(fx - 0.5)]
stress = -dt * p_vol * (J[p] - 1) * 4 * inv_dx * inv_dx * E
affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
for i in ti.static(range(3)):
for j in ti.static(range(3)):
offset = ti.Vector([i, j])
dpos = (offset.cast(float) - fx) * dx
weight = w[i][0] * w[j][1]
grid_v[base +
offset].atomic_add(weight * (p_mass * v[p] + affine @ dpos))
grid_m[base + offset].atomic_add(weight * p_mass)
for i, j in grid_m:
if grid_m[i, j] > 0:
bound = 3
inv_m = 1 / grid_m[i, j]
grid_v[i, j] = inv_m * grid_v[i, j]
grid_v[i, j][1] -= dt * 9.8
if i < bound and grid_v[i, j][0] < 0:
grid_v[i, j][0] = 0
if i > n_grid - bound and grid_v[i, j][0] > 0:
grid_v[i, j][0] = 0
if j < bound and grid_v[i, j][1] < 0:
grid_v[i, j][1] = 0
if j > n_grid - bound and grid_v[i, j][1] > 0:
grid_v[i, j][1] = 0
for p in x:
base = (x[p] * inv_dx - 0.5).cast(int)
fx = x[p] * inv_dx - base.cast(float)
w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1.0),
0.5 * ti.sqr(fx - 0.5)]
new_v = ti.Vector.zero(ti.f32, 2)
new_C = ti.Matrix.zero(ti.f32, 2, 2)
for i in ti.static(range(3)):
for j in ti.static(range(3)):
dpos = ti.Vector([i, j]).cast(float) - fx
g_v = grid_v[base + ti.Vector([i, j])]
weight = w[i][0] * w[j][1]
new_v += weight * g_v
new_C += 4 * weight * ti.outer_product(g_v, dpos) * inv_dx
v[p] = new_v
x[p] += dt * v[p]
J[p] *= 1 + dt * new_C.trace()
C[p] = new_C
gui = ti.core.GUI("MPM99", ti.veci(512, 512))
canvas = gui.get_canvas()
for i in range(n_particles):
x[i] = [random.random() * 0.4 + 0.2, random.random() * 0.4 + 0.2]
v[i] = [0, -1]
J[i] = 1
for frame in range(200):
for s in range(50):
grid_v.fill([0, 0])
grid_m.fill(0)
substep()
canvas.clear(0x112F41)
pos = x.to_numpy(as_vector=True)
for i in range(n_particles):
canvas.circle(ti.vec(pos[i, 0], pos[i, 1])).radius(
1.5).color(0x068587).finish()
gui.update()
运行上述代码,效果如下:
模拟了一杯「粒子」水晃动的情况,由于代码是运行在 CPU 上的,而对大量粒子(8192 个)的模拟仿真非常吃多线程算力,因此这里我跑出来的帧率比较低。
wave.py
最后来看一个比较炫酷的例子。
import taichi as ti
import math
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
real = ti.f32
ti.set_default_fp(real)
# ti.runtime.print_preprocessed = True
n_grid = 256
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 3e-4
max_steps = 512
vis_interval = 32
output_vis_interval = 2
steps = 256
assert steps * 2 <= max_steps
amplify = 1
scalar = lambda: ti.var(dt=real)
vec = lambda: ti.Vector(2, dt=real)
p = scalar()
target = scalar()
initial = scalar()
loss = scalar()
ti.cfg.arch = ti.cuda
@ti.layout
def place():
ti.root.dense(ti.l, max_steps).dense(ti.ij, n_grid).place(p)
ti.root.dense(ti.l, max_steps).dense(ti.ij, n_grid).place(p.grad)
ti.root.dense(ti.ij, n_grid).place(target)
ti.root.dense(ti.ij, n_grid).place(target.grad)
ti.root.dense(ti.ij, n_grid).place(initial)
ti.root.dense(ti.ij, n_grid).place(initial.grad)
ti.root.place(loss)
ti.root.place(loss.grad)
c = 340
# damping
alpha = 0.00000
inv_dx2 = inv_dx * inv_dx
dt = (math.sqrt(alpha * alpha + dx * dx / 3) - alpha) / c
learning_rate = 1
# TODO: there may by out-of-bound accesses here
@ti.func
def laplacian(t, i, j):
return inv_dx2 * (
-4 * p[t, i, j] + p[t, i, j - 1] + p[t, i, j + 1] + p[t, i + 1, j] +
p[t, i - 1, j])
@ti.kernel
def initialize():
for i in range(n_grid):
for j in range(n_grid):
p[0, i, j] = initial[i, j]
@ti.kernel
def fdtd(t: ti.i32):
for i in range(n_grid): # Parallelized over GPU threads
for j in range(n_grid):
laplacian_p = laplacian(t - 2, i, j)
laplacian_q = laplacian(t - 1, i, j)
p[t, i, j] = 2 * p[t - 1, i, j] + (
c * c * dt * dt + c * alpha * dt) * laplacian_q - p[
t - 2, i, j] - c * alpha * dt * laplacian_p
@ti.kernel
def compute_loss(t: ti.i32):
for i in range(n_grid):
for j in range(n_grid):
ti.atomic_add(loss, dx * dx * ti.sqr(target[i, j] - p[t, i, j]))
@ti.kernel
def apply_grad():
# gradient descent
for i, j in initial.grad:
initial[i, j] -= learning_rate * initial.grad[i, j]
def forward(output=None):
steps_mul = 1
interval = vis_interval
if output:
os.makedirs(output, exist_ok=True)
steps_mul = 2
interval = output_vis_interval
initialize()
for t in range(2, steps * steps_mul):
fdtd(t)
if (t + 1) % interval == 0:
img = np.zeros(shape=(n_grid, n_grid), dtype=np.float32)
for i in range(n_grid):
for j in range(n_grid): img[i, j] = p[t, i, j] * amplify + 0.5
img = cv2.resize(img, fx=4, fy=4, dsize=None)
cv2.imshow('img', img)
cv2.waitKey(1)
if output:
img = np.clip(img, 0, 255)
cv2.imwrite(output + "/{:04d}.png".format(t), img * 255)
compute_loss(steps - 1)
def main():
# initialization
target_img = cv2.imread('taichi.png')[:,:,0] / 255.0
target_img -= target_img.mean()
target_img = cv2.resize(target_img, (n_grid, n_grid))
cv2.imshow('target', target_img * amplify + 0.5)
# print(target_img.min(), target_img.max())
for i in range(n_grid):
for j in range(n_grid):
target[i, j] = float(target_img[i, j])
if False:
# this is not too exciting...
initial[n_grid // 2, n_grid // 2] = -2
forward('center')
initial[n_grid // 2, n_grid // 2] = 0
for opt in range(200):
with ti.Tape(loss):
output = None
if opt % 20 == 19:
output = 'wave/iter{:03d}/'.format(opt)
forward(output)
print('Iter', opt, ' Loss =', loss[None])
apply_grad()
forward('optimized')
if __name__ == '__main__':
main()
算法一共迭代了 200 步,每 20 步本地保存一次。执行下述代码wave_rander.py
,将渲染出的图片合成为视频。
import os
if __name__ == '__main__':
for i in range(19, 200, 20):
name = "iter%03d" % (i)
for j in range(3, 512, 2):
inputfile = name+"/%04d.png" % (j)
outputfile = name+"/%04d.png" % (j//2)
os.system("mv %s %s" % (inputfile, outputfile))
os.system("ffmpeg %s.mp4 -i '%s/%%04d.png' -start_number 0001 -vf fps=5 -r 5 -pattern_type glob" % (name, name))
结果保存在wave/
目录下,iter019.mp4
~iter199.mp4
分别保存了 19 次迭代后的结果。
iter019 | iter059 | iter199 |
---|---|---|
可以看到,经过多次迭代计算后,程序对水波的仿真效果越来越逼真。
自己尝试
sysu.py
Update:在更新到最新的 v0.3.6 之后问题就解决了(之前用的 v0.2.3)
使用粒子画了一个旋转的 SYSU。注意由于计算精度的问题(推测),用于旋转计算元的单位向量实际模长比 1 略小,这就导致了我们的 SYSU 是一边旋转一边放缩的。
import taichi as ti
def getX():
logo = [[
" #### ",
" ## ## ",
" # # ",
" # ",
" # ",
" ## ",
" ## ",
" # # ",
" # # ",
" ## ## ",
" #### "
], [
" # # ",
" # # ",
" # # ",
" # # ",
" ## ",
" ## ",
" ## ",
" ## ",
" ## ",
" ## ",
" ## "
], [
" #### ",
" ## ## ",
" # # ",
" # ",
" # ",
" ## ",
" ## ",
" # # ",
" # # ",
" ## ## ",
" #### "
], [
" # # ",
" # # ",
" # # ",
" # # ",
" # # ",
" # # ",
" # # ",
" # # ",
" # # ",
" ## ## ",
" #### "
]]
x = []
for i in range(len(logo)):
for X in range(len(logo[i])):
for Y in range(len(logo[i][X])):
if logo[i][X][Y] == '#':
x.append([i/len(logo)+Y/len(logo[i][X])/len(logo),
0.5+0.5/len(logo)-X/len(logo[i])/len(logo)])
return x
ti.cfg.arch = ti.cuda
x_logo = getX()
n_particles = len(x_logo)
dim = 2
spin = [ti.cos(0.1), ti.sin(0.1)]
x = ti.Vector(dim, dt=ti.f32, shape=n_particles)
for i in range(n_particles):
x[i] = x_logo[i]
@ti.func
def complexMul(a: ti.f32, b: ti.f32, c: ti.f32, d: ti.f32): # 复数乘法,用于旋转
return [a*c-b*d, b*c+a*d]
@ti.kernel
def substep():
for p in x:
x[p] = x[p]-0.5
x[p] = complexMul(x[p][0], x[p][1], spin[0], spin[1])
x[p] = x[p]+0.5
gui = ti.core.GUI("SYSU_CG_COURSE", ti.veci(512, 512))
canvas = gui.get_canvas()
for frame in range(19260817):
substep()
canvas.clear(0)
pos = x.to_numpy(as_vector=True)
for i in range(n_particles):
canvas.circle(ti.vec(pos[i, 0], pos[i, 1])).radius(
7).color(87*256+33).finish() # 中大绿
gui.update()
运行效果如下图,可以看到粒子数减少之后是非常丝滑的。
Polyhedron.py
最后我们使用 taichi 重新实现了OpenGL 实现简单的多边形网格数据读取和操作
from OpenGL.GL import *
from OpenGL.GLU import *
from OpenGL.GLUT import *
import taichi as ti
ti.cfg.arch = ti.cuda # Run on GPU by default
class Polyhedron:
def __init__(self, obj=None, ply=None, off=None):
self.v = []
self.faces = []
if obj != None:
for line in open(obj, 'r').readlines():
s = line.strip().split()
if (s[0] == "v"):
self.v.append([float(s[1]), float(s[2]), float(s[3])])
elif (s[0] == "f"):
# OBJ下标从1开始的,这里做转换
self.faces.append([int(s[1])-1, int(s[2])-1, int(s[3])-1])
model = Polyhedron(obj='cow.obj')
@ti.data_oriented
class Coord3Ds:
def __init__(self, n):
self.n = n
self.v = ti.Vector(3, dt=ti.f32)
def place(self, root):
root.dense(ti.i, self.n).place(self.v)
@ti.classkernel
def assign(self, v: ti.template()):
for i in self.v:
self.v[i] = v[i]
@ti.classkernel
def output_to(self, v: ti.template()):
for i in self.v:
v[i] = self.v[i]
@ti.classkernel
def rotate(self, c: ti.template(), d: ti.template()):
for i in self.v:
self.v[i][0], self.v[i][2] = self.v[i][0]*c - \
self.v[i][2] * d, self.v[i][2]*c+self.v[i][0]*d
@ti.classkernel
def normal(self): # 归一化
ma = self.v[0][0]
mi = self.v[0][0]
for i in self.v:
for j in ti.static(range(3)):
ma = max(ma, self.v[i][j])
mi = min(mi, self.v[i][j])
for i in self.v:
for j in ti.static(range(3)):
self.v[i][j] = (self.v[i][j] - mi) / (ma - mi)*2-1
global_v = Coord3Ds(len(model.v))
@ti.layout
def place():
# Place an object. Make sure you defined place for that obj
ti.root.place(global_v)
ti.root.lazy_grad()
v = ti.Vector(3, dt=ti.f32, shape=len(model.v))
for i in range(len(model.v)):
v[i] = model.v[i]
global_v.assign(v)
global_v.normal()
global_v.output_to(v)
typeNum = 0
def keyboard(key, w, h):
global typeNum
if (key == b'r'):
global_v.rotate(ti.cos(0.1), ti.sin(0.1))
global_v.output_to(v)
elif (key == b'e'):
typeNum = (typeNum + 1) % 3
else:
return
glutPostRedisplay()
def display():
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
glMatrixMode(GL_MODELVIEW)
glLoadIdentity()
for i in range(len(model.faces)):
p0 = v[model.faces[i][0]]
p1 = v[model.faces[i][1]]
p2 = v[model.faces[i][2]]
if (typeNum != 1):
glBegin(GL_LINES)
glColor3f(1, 1, 1)
glVertex3f(p0[0], p0[1], p0[2])
glVertex3f(p1[0], p1[1], p1[2])
glVertex3f(p0[0], p0[1], p0[2])
glVertex3f(p2[0], p2[1], p2[2])
glVertex3f(p1[0], p1[1], p1[2])
glVertex3f(p2[0], p2[1], p2[2])
glEnd()
if (typeNum != 2):
glBegin(GL_TRIANGLES)
glColor3f(0, 1, 1)
glVertex3f(p0[0], p0[1], p0[2])
glColor3f(1, 0, 1)
glVertex3f(p1[0], p1[1], p1[2])
glColor3f(1, 1, 0)
glVertex3f(p2[0], p2[1], p2[2])
glEnd()
glFlush()
if __name__ == "__main__":
glutInit()
glutInitDisplayMode(GLUT_RGB | GLUT_SINGLE)
glutInitWindowSize(600, 600)
glutInitWindowPosition(100, 100)
glutCreateWindow("CG_PRJ2")
glClearColor(0, 0, 0, 0)
glShadeModel(GL_SMOOTH)
glutDisplayFunc(display)
glutKeyboardFunc(keyboard)
glutMainLoop()
运行效果如下图,还是非常不错的。