Hi all,
I'm trying to remove the duplicate rows from excel files in the fastest way possible.
I initially tried the below code (but it's very slow)
- private void RemoveDuplicate(Worksheet sheet)
- {
-
- for (int i = 0; i < sheet.Columns[0].Count - 1; i++)
- {
- for (int j = i + 1; j < sheet.Columns[0].Count; j++)
- {
- if (sheet.Columns[0].CellList[i].Value == sheet.Columns[0].CellList[j].Value)
- {
- for (int col = 0; col < sheet.Rows[0].Count; col++)
- {
- if (col == sheet.Rows[0].Count - 1 && sheet.Rows[i].CellList[col].Value == sheet.Rows[j].CellList[col].Value)
- {
- sheet.DeleteRow(j + 1);
- j--;
- }
- else if (col != sheet.Rows[0].Count - 1 && sheet.Rows[i].CellList[col].Value == sheet.Rows[j].CellList[col].Value)
- {
- continue;
- }
- else
- {
- break;
- }
- }
- }
- }
-
- }
- }
Then using it like
- testWorkbook.LoadFromFile(filePath);
- testSheet = testWorkbook.Worksheets[0];
-
- RemoveDuplicate(testSheet);
Seeing it's slowness I tried a different approach (show below), but it is not working:
- private void RemoveDuplicate2(Worksheet sheet, CellRange cellRanges)
- {
-
- var duplicatedRows = cellRanges.Rows
- .GroupBy(x=> x.Columns[0].Value)
- .Where(x=> x.Count() > 1)
- .Select(x=> x.Last().Columns[0].Row)
- .ToList();
-
- foreach (var element in duplicatedRows)
- {
- sheet.DeleteRow(element);
- }
-
- }
Then using it like
- testWorkbook.LoadFromFile(filePath);
- testSheet = testWorkbook.Worksheets[0];
-
-
- var cellRanges = testSheet.Range["A2:G" + testSheet.Columns[0].Count];
-
- RemoveDuplicate2(testSheet, cellRanges);
Please help me on this ...