Pre
## Loss ##
loss = C1 x pixel_loss / (depth_loss / 4) + C2 x depth_loss # (empirically, 1:4 loss ratio)
| exp1 | not trainable<br>Only Depth loss | | Train DT = Train | Val DT = Val | lr = 0.0001 |
|---|---|---|---|---|---|
| | Audio Model | Audio | Image Model | Image | Others |
| | ResNet 18 | 1D | ResNet 18 | 2q mask Image | Batch Size = 4 |
| | Decoder | | | | |
| | temporal upconv. | | | | |
| exp2 | not trainable<br>Pixel : Depth loss = 1 : 1 | | Train DT = Train | Val DT = Val | lr = 0.0001 |
|---|---|---|---|---|---|
| | Audio Model | Audio Input | Image Model | Image Input | Others |
| | ResNet 18 | 1D | ResNet 18 | 2q mask Image | Batch Size = 4 |
| | Not Freeze, None | | Freeze, mp3d | | |
| | | | | | |
| exp3 | not trainable<br>Pixel : Depth loss = 1 : 10 | | Train DT = Train | Val DT = Val | lr = 0.0001 |
|---|---|---|---|---|---|
| | Audio Model | Audio Input | Image Model | Image Input | Others |
| | ResNet 18 | 1D | ResNet 18 | 2q mask Image | Batch Size = 4 |
| | Not Freeze, None | | Freeze, mp3d | | |
| | | | | | |