Reconstruction of image from segmented chunks from multiple frames

Posted: January 9, 2012 at 3:47 pm

Above is a reconstruction of approximately 20 frames of video test footage. The reconstruction is composed of the perceptual chunks that were segmented (as described in previous posts). White areas are areas where no segmentation was performed. So far this just looks like a normal long exposure, but things should get more aesthetically interesting once these percepts are treated individually.

Some rudimentary clustering is done in this version, which is supposed to correspond to perceptual association (the persistence of a class of an object over time). In this case, percepts are merged if they are similar enough (according to histogram, position and scale). Merging currently only happens with static percepts (background objects) which have not moved. Dynamic object merges will also happen next, where a single perceptual chunk will be associated with multiple locations in space. Note that for this test data scale is also used as a merging feature, because nearly all movement is on a 2D plane. If installed in a location like a public square, movement will happen in all directions and aspect ratio will be used rather than scale.

One thing to note is that the function that calculates a distance matrix for the percepts is very slow, taking 90 seconds for approximately 6000 perceptual chunks.

Below is the code in its current state (it will soon grow beyond a single file at this rate!)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/gpu/gpu.hpp" // For GPU processing

#include <iostream>
#include <fstream> // for writing to files.
#include <stdio.h>
#include <sys/time.h>
#include <vector> // For dynamic arrays.
#include <set> // for unique sets of values
#include <cmath>

#define MAXSTRING 50 // 50 chars enough?

using namespace cv;
using namespace std;

// Class to hold the perceptual chunks.
class percepUnit {

    // externally accessible class members?
    public:
        Mat image; // percept itself
        Mat mask; // alpha channel
        Mat Lhist,RLhist; // hist Luminance (normalized and Raw)
        Mat Uhist,RUhist; // hist Blue - Yellow
        Mat Vhist,RVhist; // hist Green - Red
        int x1, y1; // close edges
        int w, h; // width / height of patch
        int x2, y2; // coord of far edges
        int cx, cy; // location of percept in frame
        // TODO add a vector to hold an array of points when this percept is perceptually repeated?
        int time; // location of percept in time (frame number)
        bool remove; // used to delete percepts

        // constructor method
        percepUnit(Mat ROI, Mat alpha, int ix, int iy, int iw, int ih, int itime) {
            ROI.copyTo(image); // make copies since args are refs!!
            alpha.copyTo(mask);
            x1 = ix;
            y1 = iy;
            w = iw;
            h = ih;
            time = itime;
            x2 = x1 + iw;
            y2 = y1 + ih;
            cx = x1 + (w / 2);
            cy = y1 + (h / 2);

            remove = false;

            calcFeatures(); // calculate features on construction!
        }

        // Dumps data
        void dump(int id) {
            ofstream outputFile;
            char filenameData[MAXSTRING];
            sprintf(filenameData, "data/%d_%d_data.csv", id, time); // TODO is this the proper C++ way?
            outputFile.open(filenameData, ios::out);
            outputFile << "instance,time,cx,cy,lhist,uhist,vhist\n";
            for (int i=0; i<256; i++) {
                outputFile << id << "," << time << "," << cx << "," << cy << "," <<
                Lhist.at<float>(i) << "," << Uhist.at<float>(i) << "," << Vhist.at<float>(i) << endl;
            }
            outputFile.close();
        }

        // Write image and mask to disk
        void writeImages(int id) {
            char filenameImage[MAXSTRING], filenameMask[MAXSTRING];
            // Write regions to Disk. TODO make sure "region" folder exists, create it if not
            sprintf(filenameImage, "regions/%d_image.jpg", id); // TODO is this the proper C++ way?
            sprintf(filenameMask, "regions/%d_mask.jpg", id);
            imwrite(filenameImage, image);
            imwrite(filenameMask, mask);
        }

        void calcFeatures() {
            Mat luvImage;
            vector<Mat> imagePlanes;
            int bins = 256; // 256 bins/channel

            cvtColor(image, luvImage, CV_BGR2Luv); // convert from BGR to LUV
            split(luvImage, imagePlanes); // Scale and Split channels
           
            // Histograms
            calcHist(&imagePlanes[0], 1, 0, mask, Lhist, 1, &bins, 0); // L
            calcHist(&imagePlanes[1], 1, 0, mask, Uhist, 1, &bins, 0); // U
            calcHist(&imagePlanes[2], 1, 0, mask, Vhist, 1, &bins, 0); // V

            // Normalize Histograms (in place) TODO switch to GPU?
            normalize(Lhist, Lhist, 0, 1, CV_MINMAX);
            normalize(Uhist, Uhist, 0, 1, CV_MINMAX);
            normalize(Vhist, Vhist, 0, 1, CV_MINMAX);
        }

        // Destructor
        // TODO the descructor gets called for every vector push_back because of resize. If I change to a list will that avoid this problem?
        ~percepUnit() {
            // Free memory
            image.release();
            mask.release();
            Lhist.release();
            Uhist.release();
            Vhist.release();

            // Debug
            //cout << "instance deleted" << endl;
        }
};

// pixel by pixel offset copy of perceptUnit into larger image.
int copyPercept(percepUnit &unit, Mat &dest) {

    // Loop through pixels in percept image
    for( int y = 0; y < unit.image.rows; y++ ) {
        for( int x = 0; x < unit.image.cols; x++ ) {
            // Make sure this pixel is in the mask.
            if (unit.mask.at<char>(y,x) != 0) {
                // get pixels from src image:
                Vec3b pixel = unit.image.at<Vec3b>(y,x); // Vec3b is a 3 element vector of unsigned chars.

                // set pixels in dest image (offset for this percept)
                dest.at<Vec3b>(y+unit.y1,x+unit.x1) = pixel;
            }
        }
    }

    return(0);
}

// get the current time (for rough profiling)
double getTime() {
    timeval rawtime;
    double time;

    gettimeofday(&rawtime, NULL);
    time = rawtime.tv_sec+(rawtime.tv_usec/1000000.0);
    return(time);
}

// Process one frame, append items to an existing vector of percepUnits.
int segmentImage(Mat &image, vector<percepUnit> &percepUnits, int frameNum) {

    Mat orig, meanshift,meanshift4C, mask, flood, image4C;
    gpu::GpuMat gpuImage, temp1, temp2, temp3;
    double t1, t2;
    int numRegions = 0;
    int numROIs = 0;
    int area, rectArea;
    Rect *boundingRect = new Rect(); // Stored bounding box for each flooded area.

    t1 = getTime();

    // Try and do hard image processing on the GPU: (only works on micro!)

    // Copy the original image for in-place processing.
    image.copyTo(orig); // Main memory version.

    // convert image to 4 channels and upload to GPU for GPU operations.
    // TODO try and use Luv rather than BGR for these operations? How can you have a 4 channel luv image for gpu?
    cvtColor(image, image4C, CV_BGR2BGRA,4); // 4 channels for gpu operation
    gpuImage.upload(image4C);

    // morphology (supports in place operation)
    Mat element = getStructuringElement(MORPH_ELLIPSE, Size(5,5), Point(2, 2) );
    gpu::morphologyEx(gpuImage, temp1, MORPH_CLOSE, element);
    gpu::morphologyEx(temp1, temp2, MORPH_OPEN, element);

    // Mean shift filtering
    TermCriteria iterations = TermCriteria(CV_TERMCRIT_ITER, 2, 0);
    gpu::meanShiftFiltering(temp2, temp3, 10, 40, iterations); // 10, 40, 2 iterations

    // Download and convert to 3 channels the processed image from GPU to main memory: TODO where to free gpuMat?
    temp3.download(meanshift4C);
    cvtColor(meanshift4C,meanshift,CV_BGRA2BGR, 3); // convert to three channels

    // place to store ffill masks
    mask = Mat( meanshift.rows+2, meanshift.cols+2, CV_8UC1, Scalar::all(0) ); // Make black single-channel image.
    meanshift.copyTo(flood); // copy image

    // Loop through all the pixels and flood fill.
    // don't bother starting in the upper left corner, due to the edge effects.
    for( int y = 5; y < flood.rows; y++ )
    {
        for( int x = 5; x < flood.cols; x++ )
        {
            if( mask.at<uchar>(y+1, x+1) == 0 ) // mask is offset from original image.
            {
                numRegions++;
                //Flags: connectivity=4, fill value = 255, boundingRect is the size of the filled region. additional flags?
                // testing: area = floodFill( flood, mask, Point(x,y), NULL, boundingRect, Scalar::all(10), Scalar::all(10), (4|255<<8)+FLOODFILL_FIXED_RANGE);
                area = floodFill( flood, mask, Point(x,y), NULL, boundingRect, Scalar::all(1), Scalar::all(1), (8|255<<8));
                //Extract a subimage for each flood, if the flood is large enough.
                rectArea = boundingRect->width*boundingRect->height;
                if (area > 100 and area < 60000) {
                //if (rectArea > 10 && rectArea < 2056236) { // greater than 20x20 and smaller than 1920x1080
                   
                    Mat ROI = orig(*boundingRect); // Make a cropped reference (not copy) of the image

                    // crop translated mask to register with original image.
                    boundingRect->y++;
                    boundingRect->height++;
                    boundingRect->x++;
                    boundingRect->width++;
                    Mat tmp = mask(*boundingRect);
                    Mat alpha = tmp(Range(0,tmp.rows-1),Range(0,tmp.cols-1)); // crop mask to match image

                    // Append an instance to the vector.
                    percepUnits.push_back(percepUnit(ROI, alpha, boundingRect->x-1, boundingRect->y-1, boundingRect->width-1, boundingRect->height-1, frameNum));
                    numROIs++;
                }
            }
        }
    }

    t2 = getTime();
   
    // TODO add a debug mode.
    cout << "Processing Time (segmentation+feature calculation): " << t2-t1 << endl;
    cout << "Total Regions: " << numRegions << endl;
    cout << "ROI Regions: " << numROIs << endl;

    // release GpuMats (is this needed or done automatically by return()?
    gpuImage.release();
    temp1.release();
    temp2.release();
    temp3.release();
   
    return(0);
}

// Function to construct an accumulation.
Mat mergeImages(Mat inputImage1, Mat inputImage2) {
   
    int width, height;
    Mat merged, smallImage, bigImage, image1, image2;

    // copy input images TODO find a way to do this without the copy?
    inputImage1.copyTo(image1);
    inputImage2.copyTo(image2);

    // use the resolution and aspect of the largest of the pair.
    if (image1.rows > image2.rows)
        width = image1.rows;
    else
        width = image2.rows;

    if (image1.cols > image2.cols)
        height = image1.cols;
    else
        height = image2.cols;

    // rescale the smaller images to match the larger.
    if ((image1.rows * image1.cols) > (image2.rows * image2.cols)) {
        resize(image2, smallImage, image1.size(), INTER_LINEAR ); // TODO Linear interpolation good enough?
        bigImage = image1;
    } else {
        resize(image1, smallImage, image2.size(), INTER_LINEAR );
        bigImage = image2;
    }
   
    merged = Mat(height, width, CV_8UC3);

    merged = (bigImage+smallImage)/2; // mean of two images.

    return(merged);
}

// Merge two perceps and return a new percept to replace them.
// TODO do the same thing for time as you do with position? (mean if below a threshols, vector otherwise?)
percepUnit mergeStaticPerceps(vector<percepUnit> &percepUnits, int mergeID, vector<int> &merges) {
    Mat accumulation;
    Mat accumulationMask;
    int x, y, w, h, t;
    int Xs, Ys, Ws, Hs;
    int meanX, meanY, meanW, meanH;

    percepUnits.at(mergeID).image.copyTo(accumulation); // since we're not really merging but making new images from merges.
    percepUnits.at(mergeID).mask.copyTo(accumulationMask);
    percepUnits.at(mergeID).remove = true;

    Xs = percepUnits.at(mergeID).x1;
    Ys = percepUnits.at(mergeID).y1;
    Ws = percepUnits.at(mergeID).w;
    Hs = percepUnits.at(mergeID).h;

/*cout << "IDs:           " << mergeID << endl;
cout << "initial X: " << percepUnits.at(mergeID).x1 << endl;
cout << "IXs: " << Xs << endl;*/


    for (int j = 0; j < merges.size(); j++) {
        // Merge Images
        int match = merges.at(j);
        accumulation = mergeImages(accumulation, percepUnits.at(match).image);
        accumulationMask = mergeImages(accumulationMask, percepUnits.at(match).mask);

        // sums of positions, widths and heights of merging percepts
        Xs += percepUnits.at(match).x1;
        Ys += percepUnits.at(match).y1;
        Ws += percepUnits.at(match).w;
        Hs += percepUnits.at(match).h;

    //cout << "Xs " << Xs << endl;

        percepUnits.at(match).remove = true;
    }

    meanX = Xs/(merges.size()+1); // sum of matching Xs divided by the number of matches+orig.
    meanY = Ys/(merges.size()+1);
    meanW = Ws/(merges.size()+1);
    meanH = Hs/(merges.size()+1);

//cout << "Xmean: " << meanX << endl << endl;

    percepUnit mergedUnit = percepUnit(accumulation, accumulationMask, meanX, meanY, meanW, meanH, 0);

    return(mergedUnit);
    //percepUnits.push_back(percepUnit(ROI, alpha, boundingRect->x-1, boundingRect->y-1, boundingRect->width-1, boundingRect->height-1, frameNum));
}

// Determine which static percepts should be merged
// since the vector gets reallocated for each frame, a list may be much more efficient.
vector< vector<int> > calcStaticMerges(vector<percepUnit> &percepUnits, Mat distances) {

    double areaDiff, posDiff, distance;
    vector< vector<int> > merges; // a vector of vectors of ints (matches).
    vector<int> matches; // vector of ints

    for(int i = 0; i <distances.rows; i++) {
        matches.clear(); // clear matches for each percep
        for(int j = 0; j < distances.cols; j++) { // j<i makes removes redundancy in the matrix (ragged)
            distance = distances.at<float>(i,j);
            if ((distance != -1) and (i != j)) {
                if (distance > .8) {
                    areaDiff = abs( (percepUnits[i].w * percepUnits[i].h) - (percepUnits[j].w * percepUnits[j].h) );
                    posDiff = sqrt((float)pow(percepUnits[i].cx-percepUnits[j].cx,2)+pow((float)percepUnits[i].cy-percepUnits[j].cy,2));
                    if (areaDiff < 0.05 and posDiff < 2) {
                        matches.push_back(j); // push the matches (j) for this particular percep (i)
                    }
                }
            }
        }
        merges.push_back(matches); // append matches for this particular percep (i) to the global vector
    }

    return(merges);
}

// calculate a distance matrix for all percepts at this time step.
// TODO how to do these if we change from vector to list using iterators? (without redundancy?) Ask Philippe.
Mat calcDistances(vector<percepUnit> &percepUnits) {
    double Ldist, Udist, Vdist, meanDist, areaDiff;
    Mat distances;
    double t1, t2;
    t1 = getTime();

    distances = Mat(percepUnits.size(),percepUnits.size(), CV_32F, -1);

    for(int i = 0; i <percepUnits.size(); i++) {
        for(int j = 0; j <i; j++) { // j<i makes removes redundancy in the matrix (ragged)
            // TODO which method to use?
            // Chi-Square (method=CV_COMP_CHISQR)
            // Intersection (method=CV_COMP_INTERSECT)
            // Bhattacharyya distance (method=CV_COMP_BHATTACHARYYA)
            Ldist = compareHist(percepUnits[i].Lhist, percepUnits[j].Lhist, CV_COMP_CORREL);
            Udist = compareHist(percepUnits[i].Uhist, percepUnits[j].Uhist, CV_COMP_CORREL);
            Vdist = compareHist(percepUnits[i].Vhist, percepUnits[j].Vhist, CV_COMP_CORREL);

            meanDist = (Ldist+Udist+Vdist)/3; // TODO try weighting luma over colour?

            distances.at<float>(i,j) = meanDist;
        }
    }

    t2 = getTime();
    cout << "Processing Time (distance calculation): " << t2-t1 << endl;

    return(distances);
}

// Print a usage message and terminate with a non-zero exit status.
void help(string programName) {
    cout << "Usage: " << programName
         << " [image filename | sequence format string] [--frame | --sequence] --reconstruction --dump"
         << endl;
    exit(1);
}

// Predicate to use remove_if to delete merge inputs.
bool deleteThisPercep(const percepUnit &unit) {
    return unit.remove;
}

int main(int argc, char* argv[])
{
    Mat image, reconstruction, distances;
    bool writeReconstruction = false;
    bool dumpData = false;
    bool doSegmentationFrame = false;
    bool doSegmentationSequence = false;
    string filePattern;
    char filename[MAXSTRING];
    double t1, t2;
    vector<percepUnit> percepUnits; // dynamic vector to store instances of percepUnit.
    vector< vector<int> > staticMerges; // vector of merges to be done.

    // Test in case there are no arguments.
    if (argc <= 2) {
        help(argv[0]);
        exit(1);
    }

    filePattern = argv[1];

    // Loop through remaining arguments
    for (int i=2; i<argc; i++) {
        string arg = argv[i]; // why does argv[i] == "-f" not work?

        // only check args that start with --
        if (arg.find("--") == 0) {
            if (!arg.compare("--frame")) {
                doSegmentationFrame = true;
            } else if (!arg.compare("--sequence")) {
                doSegmentationSequence = true;
            } else if (!arg.compare("--reconstruction")) {
                writeReconstruction = true;
            } else if (!arg.compare("--dump")) {
                dumpData = true;
            } else {
                help(argv[0]);
            }
        } else {
            help(argv[0]);
        }
    }

    if (doSegmentationFrame) {
       
        image = imread( filePattern ); // load input image from disk to main memory.
        CV_Assert( not image.empty() ); // bail if image load fails.

        segmentImage(image, percepUnits, 0); // if we process one frame the time of all percepts is set to 0.

        if (writeReconstruction) { // disable reconstructions of a sequence for now.

            reconstruction = Mat(1080,1920, CV_8UC3, Scalar(255,255,255)); // white background
            for(int i = 0; i <percepUnits.size(); i++) {
                copyPercept(percepUnits[i], reconstruction);
            }

            imwrite("reconstruction.png",reconstruction);
        }

        if (dumpData) { // currently only possible in frame mode.

            // Calculate features for all percepUnits
            for(int i = 0; i <percepUnits.size(); i++) {
                percepUnits[i].dump(i);
                percepUnits[i].writeImages(i);
            }
        }

    } else if (doSegmentationSequence) {

        // Main sequence loop
        for (int frameNum=1; frameNum < 40; frameNum+=2) { // TODO make these numbers setable from command flags.

            // Construct filenames from formatstring:
            char *filePatternChar = (char*)filePattern.c_str();
            sprintf(filename, filePatternChar, frameNum);
           
            image = imread( filename );
            CV_Assert( not image.empty() );

            segmentImage(image, percepUnits, frameNum);

            // calculate features and dump (images!)
            if (dumpData) {
                for(int i = 0; i <percepUnits.size(); i++) {
                    percepUnits[i].dump(i);
                    percepUnits[i].writeImages(i); // TODO this was commented before
                }
            }

            // calc distances

            // merge perceps

            // linkage clustering
        }

    } else {
        cout << "You must specify either --frame or --sequence." << endl;
        help(argv[0]);
    }

    // The following items will eventually move into the main loop for each frame,
    // but for testing we're only running it after a number of frames have been processed.
   
    // calc distances
    // TODO when doing clustering it likely makes sense to have a shared distance matrix for positions and other features.
    cout << "calculating distances" << endl;
    distances = calcDistances(percepUnits);

    // calc static merges
    staticMerges = calcStaticMerges(percepUnits, distances);

    // calc moving merges

    // Merge percepts
    cout << "merging static percepts" << endl;
    for (int i = 0; i < staticMerges.size(); i++) { // for each merge
        if (staticMerges[i].size() > 0) {
            //percepUnit tmp = mergePerceps(percepUnits, i, merges[i]);
            percepUnits.push_back(mergeStaticPerceps(percepUnits, i, staticMerges[i]));
        }
    }

    // delete merges!
    cout << "deleting percepts" << endl;
    cout << "size before erase: " << percepUnits.size() << endl;
    // The predicate (deleteThisPercep) is a function that takes each instance as an argument and returns true or false.
    // remove_if returns iterators for all elements where the predicate is true, between the ranges specified.
    // erase removes (and deconstructs) items contained in the iterator returned by remove_if. (until the end of the list?)
    percepUnits.erase(remove_if(percepUnits.begin(), percepUnits.end(), deleteThisPercep), percepUnits.end());
    cout << "size after erase: " << percepUnits.size() << endl;

    // Draw a reconstruction from multiple frames.
    reconstruction = Mat(1080,1920, CV_8UC3, Scalar(255,255,255)); // white background
    for(int i = 0; i <percepUnits.size(); i++) {
        copyPercept(percepUnits[i], reconstruction);
    }
    imwrite("reconstruction.png",reconstruction);

    percepUnits.clear(); // cleaup*/
    return(0);
}