11 KiB
CSE5519 Advances in Computer Vision (Topic E: 2021 and before: Deep Learning for Geometric Computer Vision)
Note
This topic is presented by Me. and will be the most detailed one for this course, perhaps.
PoseNet
A Convolutional Network for Real-Time 6-DOF Camera Relocalization (ICCV 2015)
Problem solving:
Camera Pose: Camera position and orientation.
Convolutional neural network (convnet) we train to estimate camera pose directly
from a monocular image, I. Our network outputs a pose
vector p, given by a 3D camera position x and orientation
represented by quaternion q:
p = [x, q]
q is a quaternion, x is a 3D camera position.
Arbitrary 4D values are easily mapped to legitimate rotations by normalizing them to unit length.
Regression function
Use Stochastic Gradient Descent (SGD) to optimize the network parameters.
loss(I)=\|\hat{x}-x\|_2+\beta\left\|\hat{q}-\frac{q}{\|q\|}\right\|_2
\hat{x} is the estimated camera position, x is the ground truth camera position, \hat{q} is the estimated camera orientation, q is the ground truth camera orientation.
\beta is a hyperparameter that scale the loss of the camera orientation so that the network can balance on estimating the camera position and orientation approximately the same weight.
Network architecture
Based on GoogLeNet (SOTA in 2014), but with a few changes:
- Replace all three softmax classifiers with affine regressors.
- Insert another fully connected layer before final regressor of feature size 2048
- At test time, normalize the quaternion to unit length.
Architecture
from network import Network
class GoogLeNet(Network):
def setup(self):
(self.feed('data')
.conv(7, 7, 64, 2, 2, name='conv1')
.max_pool(3, 3, 2, 2, name='pool1')
.lrn(2, 2e-05, 0.75, name='norm1')
.conv(1, 1, 64, 1, 1, name='reduction2')
.conv(3, 3, 192, 1, 1, name='conv2')
.lrn(2, 2e-05, 0.75, name='norm2')
.max_pool(3, 3, 2, 2, name='pool2')
.conv(1, 1, 96, 1, 1, name='icp1_reduction1')
.conv(3, 3, 128, 1, 1, name='icp1_out1'))
(self.feed('pool2')
.conv(1, 1, 16, 1, 1, name='icp1_reduction2')
.conv(5, 5, 32, 1, 1, name='icp1_out2'))
(self.feed('pool2')
.max_pool(3, 3, 1, 1, name='icp1_pool')
.conv(1, 1, 32, 1, 1, name='icp1_out3'))
(self.feed('pool2')
.conv(1, 1, 64, 1, 1, name='icp1_out0'))
(self.feed('icp1_out0',
'icp1_out1',
'icp1_out2',
'icp1_out3')
.concat(3, name='icp2_in')
.conv(1, 1, 128, 1, 1, name='icp2_reduction1')
.conv(3, 3, 192, 1, 1, name='icp2_out1'))
(self.feed('icp2_in')
.conv(1, 1, 32, 1, 1, name='icp2_reduction2')
.conv(5, 5, 96, 1, 1, name='icp2_out2'))
(self.feed('icp2_in')
.max_pool(3, 3, 1, 1, name='icp2_pool')
.conv(1, 1, 64, 1, 1, name='icp2_out3'))
(self.feed('icp2_in')
.conv(1, 1, 128, 1, 1, name='icp2_out0'))
(self.feed('icp2_out0',
'icp2_out1',
'icp2_out2',
'icp2_out3')
.concat(3, name='icp2_out')
.max_pool(3, 3, 2, 2, name='icp3_in')
.conv(1, 1, 96, 1, 1, name='icp3_reduction1')
.conv(3, 3, 208, 1, 1, name='icp3_out1'))
(self.feed('icp3_in')
.conv(1, 1, 16, 1, 1, name='icp3_reduction2')
.conv(5, 5, 48, 1, 1, name='icp3_out2'))
(self.feed('icp3_in')
.max_pool(3, 3, 1, 1, name='icp3_pool')
.conv(1, 1, 64, 1, 1, name='icp3_out3'))
(self.feed('icp3_in')
.conv(1, 1, 192, 1, 1, name='icp3_out0'))
(self.feed('icp3_out0',
'icp3_out1',
'icp3_out2',
'icp3_out3')
.concat(3, name='icp3_out')
.avg_pool(5, 5, 3, 3, padding='VALID', name='cls1_pool')
.conv(1, 1, 128, 1, 1, name='cls1_reduction_pose')
.fc(1024, name='cls1_fc1_pose')
.fc(3, relu=False, name='cls1_fc_pose_xyz'))
(self.feed('cls1_fc1_pose')
.fc(4, relu=False, name='cls1_fc_pose_wpqr'))
(self.feed('icp3_out')
.conv(1, 1, 112, 1, 1, name='icp4_reduction1')
.conv(3, 3, 224, 1, 1, name='icp4_out1'))
(self.feed('icp3_out')
.conv(1, 1, 24, 1, 1, name='icp4_reduction2')
.conv(5, 5, 64, 1, 1, name='icp4_out2'))
(self.feed('icp3_out')
.max_pool(3, 3, 1, 1, name='icp4_pool')
.conv(1, 1, 64, 1, 1, name='icp4_out3'))
(self.feed('icp3_out')
.conv(1, 1, 160, 1, 1, name='icp4_out0'))
(self.feed('icp4_out0',
'icp4_out1',
'icp4_out2',
'icp4_out3')
.concat(3, name='icp4_out')
.conv(1, 1, 128, 1, 1, name='icp5_reduction1')
.conv(3, 3, 256, 1, 1, name='icp5_out1'))
(self.feed('icp4_out')
.conv(1, 1, 24, 1, 1, name='icp5_reduction2')
.conv(5, 5, 64, 1, 1, name='icp5_out2'))
(self.feed('icp4_out')
.max_pool(3, 3, 1, 1, name='icp5_pool')
.conv(1, 1, 64, 1, 1, name='icp5_out3'))
(self.feed('icp4_out')
.conv(1, 1, 128, 1, 1, name='icp5_out0'))
(self.feed('icp5_out0',
'icp5_out1',
'icp5_out2',
'icp5_out3')
.concat(3, name='icp5_out')
.conv(1, 1, 144, 1, 1, name='icp6_reduction1')
.conv(3, 3, 288, 1, 1, name='icp6_out1'))
(self.feed('icp5_out')
.conv(1, 1, 32, 1, 1, name='icp6_reduction2')
.conv(5, 5, 64, 1, 1, name='icp6_out2'))
(self.feed('icp5_out')
.max_pool(3, 3, 1, 1, name='icp6_pool')
.conv(1, 1, 64, 1, 1, name='icp6_out3'))
(self.feed('icp5_out')
.conv(1, 1, 112, 1, 1, name='icp6_out0'))
(self.feed('icp6_out0',
'icp6_out1',
'icp6_out2',
'icp6_out3')
.concat(3, name='icp6_out')
.avg_pool(5, 5, 3, 3, padding='VALID', name='cls2_pool')
.conv(1, 1, 128, 1, 1, name='cls2_reduction_pose')
.fc(1024, name='cls2_fc1')
.fc(3, relu=False, name='cls2_fc_pose_xyz'))
(self.feed('cls2_fc1')
.fc(4, relu=False, name='cls2_fc_pose_wpqr'))
(self.feed('icp6_out')
.conv(1, 1, 160, 1, 1, name='icp7_reduction1')
.conv(3, 3, 320, 1, 1, name='icp7_out1'))
(self.feed('icp6_out')
.conv(1, 1, 32, 1, 1, name='icp7_reduction2')
.conv(5, 5, 128, 1, 1, name='icp7_out2'))
(self.feed('icp6_out')
.max_pool(3, 3, 1, 1, name='icp7_pool')
.conv(1, 1, 128, 1, 1, name='icp7_out3'))
(self.feed('icp6_out')
.conv(1, 1, 256, 1, 1, name='icp7_out0'))
(self.feed('icp7_out0',
'icp7_out1',
'icp7_out2',
'icp7_out3')
.concat(3, name='icp7_out')
.max_pool(3, 3, 2, 2, name='icp8_in')
.conv(1, 1, 160, 1, 1, name='icp8_reduction1')
.conv(3, 3, 320, 1, 1, name='icp8_out1'))
(self.feed('icp8_in')
.conv(1, 1, 32, 1, 1, name='icp8_reduction2')
.conv(5, 5, 128, 1, 1, name='icp8_out2'))
(self.feed('icp8_in')
.max_pool(3, 3, 1, 1, name='icp8_pool')
.conv(1, 1, 128, 1, 1, name='icp8_out3'))
(self.feed('icp8_in')
.conv(1, 1, 256, 1, 1, name='icp8_out0'))
(self.feed('icp8_out0',
'icp8_out1',
'icp8_out2',
'icp8_out3')
.concat(3, name='icp8_out')
.conv(1, 1, 192, 1, 1, name='icp9_reduction1')
.conv(3, 3, 384, 1, 1, name='icp9_out1'))
(self.feed('icp8_out')
.conv(1, 1, 48, 1, 1, name='icp9_reduction2')
.conv(5, 5, 128, 1, 1, name='icp9_out2'))
(self.feed('icp8_out')
.max_pool(3, 3, 1, 1, name='icp9_pool')
.conv(1, 1, 128, 1, 1, name='icp9_out3'))
(self.feed('icp8_out')
.conv(1, 1, 384, 1, 1, name='icp9_out0'))
(self.feed('icp9_out0',
'icp9_out1',
'icp9_out2',
'icp9_out3')
.concat(3, name='icp9_out')
.avg_pool(7, 7, 1, 1, padding='VALID', name='cls3_pool')
.fc(2048, name='cls3_fc1_pose')
.fc(3, relu=False, name='cls3_fc_pose_xyz'))
(self.feed('cls3_fc1_pose')
.fc(4, relu=False, name='cls3_fc_pose_wpqr'))
Unsupervised Learning of Depth and Ego-Motion From Video
(CVPR 2017)
This is a method that estimates both depth and camera pose motion from a single video using CNN.
Jointly training a single-view depth CNN and a camera pose estimation CNN form unlabelled monocular video sequences.
View synthesis as supervision
Notice that in mo
Unsupervised Monocular Depth Estimation with Left-Right Consistency
(CVPR 2017)
This is a method that use pair of images as Left and Right eye to estimate depth. Increased consistency by flipping the right-left relation.
GeoNet
Unsupervised Learning of Dense Depth, Optical Flow and Camera Pose (CVPR 2018)
Problem solving:
Depth Estimation from single monocular image.
Rigid structure constructor
Combines the DepthNet and PoseNet to estimate the depth and camera pose motion from Unsupervised Learning of Depth and Ego-Motion From Video.
Non-rigid motion localizer
Use Left-right consistency to estimate the non-rigid motion by training the ResFlowNet.
Geometric consistency enforcement
Finally, we use an additional geometric consistency enforcement to handle non-Lambertian surfaces (e.g., metal, plastic, etc.).

