taichi 库探索 wu-kan

我们的计算机图形学小组 Project2 选题项目~

Taichi是一种用于计算机图形应用程序的高性能编程语言。

  • 性能
  • 生产率
  • 稀疏计算
  • 可微编程
  • 元编程

开发环境

硬件

所用机器型号为 VAIO Z Flip 2016

  • Intel(R) Core(TM) i7-6567U CPU @3.30GHZ 3.31GHz
  • 8.00GB RAM

软件

  • Windows 10, 64-bit (Build 17763) 10.0.17763
  • Visual Studio Code 1.39.2
    • Python 2019.10.41019:九月底发布的 VSCode Python 插件支持在编辑器窗口内原生运行 juyter nootbook 了,非常赞!
    • Remote - WSL 0.39.9:配合 WSL,在 Windows 上获得 Linux 接近原生环境的体验。
  • Windows Subsystem for Linux [Ubuntu 18.04.2 LTS]:WSL 是以软件的形式运行在 Windows 下的 Linux 子系统,是近些年微软推出来的新工具,可以在 Windows 系统上原生运行 Linux。
    • Python 3.7.4 64-bit (‘anaconda3’:virtualenv):安装在 WSL 中。

taichi 库环境配置

出于效率的考虑,太極本身是由 C++ 构建的,但用 Python 包装了接口。

# 安装openCV
python3 -m pip install opencv-python

# CPU only. No GPU/CUDA needed. (Linux, OS X and Windows)
python3 -m pip install taichi-nightly==0.3.16

# With GPU (CUDA 10.0) support (Linux only)
# python3 -m pip install taichi-nightly-cuda-10-0

# With GPU (CUDA 10.1) support (Linux only)
# python3 -m pip install taichi-nightly-cuda-10-1

这里我本地机器所用的显卡是 Intel(R) Iris(TM) 550,本身不具备 CUDA 环境,因此安装了 CPU ONLY 版本。同时由于 taichi 每天都在更新,这里仅保证代码在我们所安装的版本(这篇文章完成于 2019-12-31,此时的最新版本)上能够跑通。

运行官方示例文件

可以执行下述代码获取官方 repo 中的一些示例文件。

svn checkout https://github.com/yuanming-hu/taichi/trunk/examples

taichi_logo.py

下面这份代码来自于https://github.com/yuanming-hu/taichi/blob/master/examples/taichi_logo.py

# TODO: this program crashes clang-7 when compiled with ti.x86_64. Why?
import taichi as ti
import cv2
import numpy as np


def Vector2(x, y):
    return ti.Vector([x, y])


@ti.func
def inside(p, c, r):
    return (p - c).norm_sqr() <= r * r


@ti.func
def inside_taichi(p_):
    p = p_
    p = Vector2(0.5, 0.5) + (p - Vector2(0.5, 0.5)) * 1.11
    ret = -1
    if not inside(p, Vector2(0.50, 0.50), 0.55):
        if ret == -1:
            ret = 0
    if not inside(p, Vector2(0.50, 0.50), 0.50):
        if ret == -1:
            ret = 1
    if inside(p, Vector2(0.50, 0.25), 0.09):
        if ret == -1:
            ret = 1
    if inside(p, Vector2(0.50, 0.75), 0.09):
        if ret == -1:
            ret = 0
    if inside(p, Vector2(0.50, 0.25), 0.25):
        if ret == -1:
            ret = 0
    if inside(p, Vector2(0.50, 0.75), 0.25):
        if ret == -1:
            ret = 1
    if p[0] < 0.5:
        if ret == -1:
            ret = 1
    else:
        if ret == -1:
            ret = 0
    return ret


x = ti.var(ti.f32)

n = 512
ti.cfg.use_llvm = True


@ti.layout
def layout():
    ti.root.dense(ti.ij, n).place(x)


@ti.kernel
def paint():
    for i in range(n * 4):
        for j in range(n * 4):
            ret = 1.0 - \
                inside_taichi(Vector2(1.0 * i / n / 4, 1.0 * j / n / 4))
            x[i // 4, j // 4] += ret / 16


paint()

img = np.empty((n, n), dtype=np.float32)
for i in range(n):
    for j in range(n):
        img[i, j] = x[j, n - 1 - i]

cv2.imshow('Taichi', img)
cv2.waitKey(0)

运行上述代码,结果如下:

taichi_logo

可以看到,这里成功画出了一个太极图,说明本地配置没有问题。

mpm88.py

上面这个太极图是静态的,我们再来看一下动态画面实时渲染的效果。

下面这份代码来自于https://github.com/yuanming-hu/taichi/blob/master/examples/mpm88.py

import taichi as ti
import random

dim = 2
n_particles = 8192
n_grid = 128
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 2.0e-4
p_vol = (dx * 0.5) ** 2
p_rho = 1
p_mass = p_vol * p_rho
E = 400

x = ti.Vector(dim, dt=ti.f32, shape=n_particles)
v = ti.Vector(dim, dt=ti.f32, shape=n_particles)
C = ti.Matrix(dim, dim, dt=ti.f32, shape=n_particles)
J = ti.var(dt=ti.f32, shape=n_particles)
grid_v = ti.Vector(dim, dt=ti.f32, shape=(n_grid, n_grid))
grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))

ti.cfg.arch = ti.cuda


@ti.kernel
def substep():
    for p in x:
        base = (x[p] * inv_dx - 0.5).cast(int)
        fx = x[p] * inv_dx - base.cast(float)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1),
             0.5 * ti.sqr(fx - 0.5)]
        stress = -dt * p_vol * (J[p] - 1) * 4 * inv_dx * inv_dx * E
        affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                offset = ti.Vector([i, j])
                dpos = (offset.cast(float) - fx) * dx
                weight = w[i][0] * w[j][1]
                grid_v[base +
                       offset].atomic_add(weight * (p_mass * v[p] + affine @ dpos))
                grid_m[base + offset].atomic_add(weight * p_mass)

    for i, j in grid_m:
        if grid_m[i, j] > 0:
            bound = 3
            inv_m = 1 / grid_m[i, j]
            grid_v[i, j] = inv_m * grid_v[i, j]
            grid_v[i, j][1] -= dt * 9.8
            if i < bound and grid_v[i, j][0] < 0:
                grid_v[i, j][0] = 0
            if i > n_grid - bound and grid_v[i, j][0] > 0:
                grid_v[i, j][0] = 0
            if j < bound and grid_v[i, j][1] < 0:
                grid_v[i, j][1] = 0
            if j > n_grid - bound and grid_v[i, j][1] > 0:
                grid_v[i, j][1] = 0

    for p in x:
        base = (x[p] * inv_dx - 0.5).cast(int)
        fx = x[p] * inv_dx - base.cast(float)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1.0),
             0.5 * ti.sqr(fx - 0.5)]
        new_v = ti.Vector.zero(ti.f32, 2)
        new_C = ti.Matrix.zero(ti.f32, 2, 2)
        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                dpos = ti.Vector([i, j]).cast(float) - fx
                g_v = grid_v[base + ti.Vector([i, j])]
                weight = w[i][0] * w[j][1]
                new_v += weight * g_v
                new_C += 4 * weight * ti.outer_product(g_v, dpos) * inv_dx
        v[p] = new_v
        x[p] += dt * v[p]
        J[p] *= 1 + dt * new_C.trace()
        C[p] = new_C


gui = ti.core.GUI("MPM99", ti.veci(512, 512))
canvas = gui.get_canvas()

for i in range(n_particles):
    x[i] = [random.random() * 0.4 + 0.2, random.random() * 0.4 + 0.2]
    v[i] = [0, -1]
    J[i] = 1

for frame in range(200):
    for s in range(50):
        grid_v.fill([0, 0])
        grid_m.fill(0)
        substep()

    canvas.clear(0x112F41)
    pos = x.to_numpy(as_vector=True)
    for i in range(n_particles):
        canvas.circle(ti.vec(pos[i, 0], pos[i, 1])).radius(
            1.5).color(0x068587).finish()
    gui.update()

运行上述代码,效果如下:

mpm88

模拟了一杯「粒子」水晃动的情况,由于代码是运行在 CPU 上的,而对大量粒子(8192 个)的模拟仿真非常吃多线程算力,因此这里我跑出来的帧率比较低。

wave.py

最后来看一个比较炫酷的例子。

wave

在原作者的这个 Repo里面找到了这段代码

import taichi as ti
import math
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt

real = ti.f32
ti.set_default_fp(real)
# ti.runtime.print_preprocessed = True

n_grid = 256
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 3e-4
max_steps = 512
vis_interval = 32
output_vis_interval = 2
steps = 256
assert steps * 2 <= max_steps
amplify = 1

scalar = lambda: ti.var(dt=real)
vec = lambda: ti.Vector(2, dt=real)

p = scalar()
target = scalar()
initial = scalar()
loss = scalar()

ti.cfg.arch = ti.cuda

@ti.layout
def place():
  ti.root.dense(ti.l, max_steps).dense(ti.ij, n_grid).place(p)
  ti.root.dense(ti.l, max_steps).dense(ti.ij, n_grid).place(p.grad)
  ti.root.dense(ti.ij, n_grid).place(target)
  ti.root.dense(ti.ij, n_grid).place(target.grad)
  ti.root.dense(ti.ij, n_grid).place(initial)
  ti.root.dense(ti.ij, n_grid).place(initial.grad)
  ti.root.place(loss)
  ti.root.place(loss.grad)


c = 340
# damping
alpha = 0.00000
inv_dx2 = inv_dx * inv_dx
dt = (math.sqrt(alpha * alpha + dx * dx / 3) - alpha) / c
learning_rate = 1


# TODO: there may by out-of-bound accesses here
@ti.func
def laplacian(t, i, j):
  return inv_dx2 * (
      -4 * p[t, i, j] + p[t, i, j - 1] + p[t, i, j + 1] + p[t, i + 1, j] +
      p[t, i - 1, j])

@ti.kernel
def initialize():
  for i in range(n_grid):
    for j in range(n_grid):
      p[0, i, j] = initial[i, j]


@ti.kernel
def fdtd(t: ti.i32):
  for i in range(n_grid): # Parallelized over GPU threads
    for j in range(n_grid):
      laplacian_p = laplacian(t - 2, i, j)
      laplacian_q = laplacian(t - 1, i, j)
      p[t, i, j] = 2 * p[t - 1, i, j] + (
          c * c * dt * dt + c * alpha * dt) * laplacian_q - p[
                     t - 2, i, j] - c * alpha * dt * laplacian_p

@ti.kernel
def compute_loss(t: ti.i32):
  for i in range(n_grid):
    for j in range(n_grid):
      ti.atomic_add(loss, dx * dx * ti.sqr(target[i, j] - p[t, i, j]))

@ti.kernel
def apply_grad():
  # gradient descent
  for i, j in initial.grad:
    initial[i, j] -= learning_rate * initial.grad[i, j]

def forward(output=None):
  steps_mul = 1
  interval = vis_interval
  if output:
    os.makedirs(output, exist_ok=True)
    steps_mul = 2
    interval = output_vis_interval
  initialize()
  for t in range(2, steps * steps_mul):
    fdtd(t)
    if (t + 1) % interval == 0:
      img = np.zeros(shape=(n_grid, n_grid), dtype=np.float32)
      for i in range(n_grid):
        for j in range(n_grid): img[i, j] = p[t, i, j] * amplify + 0.5
      img = cv2.resize(img, fx=4, fy=4, dsize=None)
      cv2.imshow('img', img)
      cv2.waitKey(1)
      if output:
        img = np.clip(img, 0, 255)
        cv2.imwrite(output + "/{:04d}.png".format(t), img * 255)
  compute_loss(steps - 1)

def main():
  # initialization
  target_img = cv2.imread('taichi.png')[:,:,0] / 255.0
  target_img -= target_img.mean()
  target_img = cv2.resize(target_img, (n_grid, n_grid))
  cv2.imshow('target', target_img * amplify + 0.5)
  # print(target_img.min(), target_img.max())
  for i in range(n_grid):
    for j in range(n_grid):
      target[i, j] = float(target_img[i, j])

  if False:
    # this is not too exciting...
    initial[n_grid // 2, n_grid // 2] = -2
    forward('center')
    initial[n_grid // 2, n_grid // 2] = 0

  for opt in range(200):
    with ti.Tape(loss):
      output = None
      if opt % 20 == 19:
        output = 'wave/iter{:03d}/'.format(opt)
      forward(output)

    print('Iter', opt, ' Loss =', loss[None])

    apply_grad()

  forward('optimized')

if __name__ == '__main__':
  main()

算法一共迭代了 200 步,每 20 步本地保存一次。执行下述代码wave_rander.py,将渲染出的图片合成为视频。

import os

if __name__ == '__main__':
    for i in range(19, 200, 20):
        name = "iter%03d" % (i)
        for j in range(3, 512, 2):
            inputfile = name+"/%04d.png" % (j)
            outputfile = name+"/%04d.png" % (j//2)
            os.system("mv %s %s" % (inputfile, outputfile))
        os.system("ffmpeg %s.mp4 -i '%s/%%04d.png' -start_number 0001 -vf fps=5 -r 5 -pattern_type glob" % (name, name))

结果保存在wave/目录下,iter019.mp4~iter199.mp4分别保存了 19 次迭代后的结果。

iter019iter059iter199
iter019iter059iter199

可以看到,经过多次迭代计算后,程序对水波的仿真效果越来越逼真。

自己尝试

sysu.py

Update:在更新到最新的 v0.3.6 之后问题就解决了(之前用的 v0.2.3)

使用粒子画了一个旋转的 SYSU。注意由于计算精度的问题(推测),用于旋转计算元的单位向量实际模长比 1 略小,这就导致了我们的 SYSU 是一边旋转一边放缩的。

import taichi as ti


def getX():
    logo = [[
        "  ####  ",
        " ##  ## ",
        " #    # ",
        " #      ",
        "  #     ",
        "   ##   ",
        "     ## ",
        " #    # ",
        " #    # ",
        " ##  ## ",
        "  ####  "
    ], [
        " #    # ",
        " #    # ",
        "  #  #  ",
        "  #  #  ",
        "   ##   ",
        "   ##   ",
        "   ##   ",
        "   ##   ",
        "   ##   ",
        "   ##   ",
        "   ##   "
    ], [
        "  ####  ",
        " ##  ## ",
        " #    # ",
        " #      ",
        "  #     ",
        "   ##   ",
        "     ## ",
        " #    # ",
        " #    # ",
        " ##  ## ",
        "  ####  "
    ], [
        " #    # ",
        " #    # ",
        " #    # ",
        " #    # ",
        " #    # ",
        " #    # ",
        " #    # ",
        " #    # ",
        " #    # ",
        " ##  ## ",
        "  ####  "
    ]]
    x = []
    for i in range(len(logo)):
        for X in range(len(logo[i])):
            for Y in range(len(logo[i][X])):
                if logo[i][X][Y] == '#':
                    x.append([i/len(logo)+Y/len(logo[i][X])/len(logo),
                              0.5+0.5/len(logo)-X/len(logo[i])/len(logo)])
    return x


ti.cfg.arch = ti.cuda
x_logo = getX()
n_particles = len(x_logo)
dim = 2
spin = [ti.cos(0.1), ti.sin(0.1)]
x = ti.Vector(dim, dt=ti.f32, shape=n_particles)
for i in range(n_particles):
    x[i] = x_logo[i]


@ti.func
def complexMul(a: ti.f32, b: ti.f32, c: ti.f32, d: ti.f32):  # 复数乘法,用于旋转
    return [a*c-b*d, b*c+a*d]


@ti.kernel
def substep():
    for p in x:
        x[p] = x[p]-0.5
        x[p] = complexMul(x[p][0], x[p][1], spin[0], spin[1])
        x[p] = x[p]+0.5


gui = ti.core.GUI("SYSU_CG_COURSE", ti.veci(512, 512))
canvas = gui.get_canvas()
for frame in range(19260817):
    substep()

    canvas.clear(0)
    pos = x.to_numpy(as_vector=True)
    for i in range(n_particles):
        canvas.circle(ti.vec(pos[i, 0], pos[i, 1])).radius(
            7).color(87*256+33).finish()  # 中大绿
    gui.update()

运行效果如下图,可以看到粒子数减少之后是非常丝滑的。

sysu

Polyhedron.py

最后我们使用 taichi 重新实现了OpenGL 实现简单的多边形网格数据读取和操作

from OpenGL.GL import *
from OpenGL.GLU import *
from OpenGL.GLUT import *
import taichi as ti
ti.cfg.arch = ti.cuda  # Run on GPU by default


class Polyhedron:
    def __init__(self, obj=None, ply=None, off=None):
        self.v = []
        self.faces = []
        if obj != None:
            for line in open(obj, 'r').readlines():
                s = line.strip().split()
                if (s[0] == "v"):
                    self.v.append([float(s[1]), float(s[2]), float(s[3])])
                elif (s[0] == "f"):
                    # OBJ下标从1开始的,这里做转换
                    self.faces.append([int(s[1])-1, int(s[2])-1, int(s[3])-1])


model = Polyhedron(obj='cow.obj')


@ti.data_oriented
class Coord3Ds:
    def __init__(self, n):
        self.n = n
        self.v = ti.Vector(3, dt=ti.f32)

    def place(self, root):
        root.dense(ti.i, self.n).place(self.v)

    @ti.classkernel
    def assign(self, v: ti.template()):
        for i in self.v:
            self.v[i] = v[i]

    @ti.classkernel
    def output_to(self, v: ti.template()):
        for i in self.v:
            v[i] = self.v[i]

    @ti.classkernel
    def rotate(self, c: ti.template(), d: ti.template()):
        for i in self.v:
            self.v[i][0], self.v[i][2] = self.v[i][0]*c - \
                self.v[i][2] * d,  self.v[i][2]*c+self.v[i][0]*d

    @ti.classkernel
    def normal(self):  # 归一化
        ma = self.v[0][0]
        mi = self.v[0][0]
        for i in self.v:
            for j in ti.static(range(3)):
                ma = max(ma, self.v[i][j])
                mi = min(mi, self.v[i][j])
        for i in self.v:
            for j in ti.static(range(3)):
                self.v[i][j] = (self.v[i][j] - mi) / (ma - mi)*2-1


global_v = Coord3Ds(len(model.v))


@ti.layout
def place():
    # Place an object. Make sure you defined place for that obj
    ti.root.place(global_v)
    ti.root.lazy_grad()


v = ti.Vector(3, dt=ti.f32, shape=len(model.v))
for i in range(len(model.v)):
    v[i] = model.v[i]

global_v.assign(v)
global_v.normal()
global_v.output_to(v)
typeNum = 0


def keyboard(key, w, h):
    global typeNum
    if (key == b'r'):
        global_v.rotate(ti.cos(0.1), ti.sin(0.1))
        global_v.output_to(v)
    elif (key == b'e'):
        typeNum = (typeNum + 1) % 3
    else:
        return
    glutPostRedisplay()


def display():
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
    glMatrixMode(GL_MODELVIEW)
    glLoadIdentity()
    for i in range(len(model.faces)):
        p0 = v[model.faces[i][0]]
        p1 = v[model.faces[i][1]]
        p2 = v[model.faces[i][2]]
        if (typeNum != 1):
            glBegin(GL_LINES)
            glColor3f(1, 1, 1)
            glVertex3f(p0[0], p0[1], p0[2])
            glVertex3f(p1[0], p1[1], p1[2])
            glVertex3f(p0[0], p0[1], p0[2])
            glVertex3f(p2[0], p2[1], p2[2])
            glVertex3f(p1[0], p1[1], p1[2])
            glVertex3f(p2[0], p2[1], p2[2])
            glEnd()
        if (typeNum != 2):
            glBegin(GL_TRIANGLES)
            glColor3f(0, 1, 1)
            glVertex3f(p0[0], p0[1], p0[2])
            glColor3f(1, 0, 1)
            glVertex3f(p1[0], p1[1], p1[2])
            glColor3f(1, 1, 0)
            glVertex3f(p2[0], p2[1], p2[2])
            glEnd()
    glFlush()


if __name__ == "__main__":
    glutInit()
    glutInitDisplayMode(GLUT_RGB | GLUT_SINGLE)
    glutInitWindowSize(600, 600)
    glutInitWindowPosition(100, 100)
    glutCreateWindow("CG_PRJ2")
    glClearColor(0, 0, 0, 0)
    glShadeModel(GL_SMOOTH)
    glutDisplayFunc(display)
    glutKeyboardFunc(keyboard)
    glutMainLoop()

运行效果如下图,还是非常不错的。

运行结果