C# - 30 giây
Nếu tôi đọc đúng thì cách tiếp cận này khác với hầu hết các câu trả lời khác: tôi không sử dụng bất kỳ cấu trúc dữ liệu dựa trên hàm băm nào.
Ban đầu tôi thường không nhận được kết quả nào và không chắc đó là sự bất thường về thống kê hay lỗi trong lập luận của mình. Đã sửa: phép so sánh dùng cho tìm kiếm nhị phân bị sai.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
namespace FilterFile
{
class Program
{
// Number of rows generated for the default data file and the size of the row buffer allocated by FillData.
const int COUNT = 50000000;
// Path of the input file; Main replaces it with args[0] when that argument is supplied.
static string inputFile = "data" + COUNT + ".txt";
// Path of the results file; Main replaces it with args[1] when that argument is supplied.
static string outputFile = "results.txt";
/// <summary>
/// Program entry point. Generates a random input file when the input does not exist, loads it,
/// and writes to the output file every row for which some row has B equal to this row's A and a
/// C that is between 0 and 99 (inclusive) below this row's C.
/// </summary>
/// <param name="args">Optional arguments: the input file path, then the output file path.</param>
static void Main(string[] args)
{
    Console.WriteLine("Prepping Test");
    if (args.Length > 0) inputFile = args[0];
    if (args.Length > 1) outputFile = args[1];

    if (!File.Exists(inputFile))
    {
        Console.WriteLine(inputFile);
        File.WriteAllLines(inputFile, GenerateData(COUNT).Select(r => r.ToString()));
    }

    // Remove the file this run is going to write, which is not necessarily "results.txt".
    File.Delete(outputFile);

    Console.WriteLine("Starting Test \n\n");
    using (Timer.Create("Total Time"))
    {
        Row[] sortedA, sortedB;
        //http://codegolf.stackexchange.com/questions/26643/filter-a-large-file-quickly
        using (Timer.Create("Reading Data"))
        {
            FillData(out sortedA, out sortedB);
        }

        using (Timer.Create("Parallel Sort A"))
        {
            ParallelSort.QuicksortParallel(sortedA);
        }

        using (Timer.Create("Parallel Sort B"))
        {
            // CompareTo instead of subtraction: a difference can wrap around and report the wrong sign.
            ParallelSort.QuicksortParallel(sortedB, (x, y) => x.B.CompareTo(y.B));
        }

        // Same ordering as the sort above, so a binary search over sortedB is valid.
        var byB = Comparer<Row>.Create((x, y) => x.B.CompareTo(y.B));
        var resultsLock = new object();
        var results = new List<Row>();

        using (Timer.Create("Compute Results"))
        {
            Parallel.ForEach(sortedA, row =>
            {
                if (!HasMatch(row, sortedB, byB)) return;
                // List<T> is not safe for concurrent writers; the order of results is therefore unspecified.
                lock (resultsLock) results.Add(row);
            });
        }

        using (Timer.Create("Save Results"))
        {
            File.WriteAllLines(outputFile, results.Select(r => r.ToString()));
        }
    }
}

/// <summary>
/// Reports whether <paramref name="sortedB"/> contains a row whose B equals the A of
/// <paramref name="row"/> and whose C is between 0 and 99 (inclusive) below the C of <paramref name="row"/>.
/// </summary>
/// <param name="row">The row being tested.</param>
/// <param name="sortedB">All rows, sorted ascending by B.</param>
/// <param name="byB">Comparer that orders rows by B; must match the order of <paramref name="sortedB"/>.</param>
/// <returns><c>true</c> when at least one matching row exists.</returns>
private static bool HasMatch(Row row, Row[] sortedB, IComparer<Row> byB)
{
    // The probe carries row.A in its B column, so the search uses exactly the ordering the array
    // was sorted with and does not depend on the order in which BinarySearch passes its arguments.
    var probe = new Row { B = row.A };
    int hit = Array.BinarySearch(sortedB, probe, byB);
    if (hit < 0) return false;

    // BinarySearch may land anywhere inside a run of equal keys, so scan outwards in both directions.
    for (int i = hit; i < sortedB.Length && sortedB[i].B == row.A; i++)
    {
        if (IsInWindow(row, sortedB[i])) return true;
    }
    for (int i = hit - 1; i >= 0 && sortedB[i].B == row.A; i--)
    {
        if (IsInWindow(row, sortedB[i])) return true;
    }
    return false;
}

/// <summary>
/// Reports whether the C of <paramref name="row"/> exceeds the C of <paramref name="other"/> by 0 to 99 inclusive.
/// </summary>
private static bool IsInWindow(Row row, Row other)
{
    // Widen to long so the difference of two extreme int values cannot wrap into the accepted window.
    long diff = (long)row.C - other.C;
    return diff >= 0 && diff < 100;
}
/// <summary>
/// Loads the rows of the input file and returns two independent arrays holding the same rows in
/// file order. Neither array is sorted here; the caller sorts them.
/// </summary>
/// <param name="sortedA">Receives the rows read from the file.</param>
/// <param name="sortedB">Receives a separate copy of the same rows.</param>
/// <exception cref="InvalidDataException">The file holds more than COUNT rows.</exception>
private static void FillData(out Row[] sortedA, out Row[] sortedB)
{
    const int PARTITION_SIZE = 1 << 22;
    var rows = new Row[COUNT];
    int rowCount = ReadAndSort(rows, rows, PARTITION_SIZE);

    // Drop the unused tail: zero-filled padding rows would otherwise match each other and be reported as results.
    if (rowCount != rows.Length)
    {
        Array.Resize(ref rows, rowCount);
    }

    sortedA = rows;
    sortedB = new Row[rowCount];
    Array.Copy(sortedA, sortedB, rowCount);
}

/// <summary>
/// Parses the input file into <paramref name="tempA"/>, one row per line. Despite the name, no
/// sorting happens here.
/// </summary>
/// <remarks>
/// Only unsigned decimal digits are understood; every other byte acts as a field separator and
/// '\n' (or end of file) ends the row, so both LF and CRLF files are accepted. Lines without any
/// number are skipped, columns after the third are ignored, and missing columns stay 0.
/// </remarks>
/// <param name="tempA">Buffer that receives the parsed rows, starting at index 0.</param>
/// <param name="tempB">Not used; kept so the signature stays the same.</param>
/// <param name="PARTITION_SIZE">Not used; kept so the signature stays the same.</param>
/// <returns>The number of rows written to <paramref name="tempA"/>.</returns>
/// <exception cref="InvalidDataException">The file holds more rows than <paramref name="tempA"/> can take.</exception>
private static int ReadAndSort(Row[] tempA, Row[] tempB, int PARTITION_SIZE)
{
    int rowCount = 0;
    using (var stream = File.OpenRead(inputFile))
    using (Timer.Create("Read From Disk"))
    {
        var current = new Row();
        int fieldIndex = 0;     // how many numbers have been completed on the current line
        int value = 0;          // number being accumulated
        bool inNumber = false;  // true while at least one digit of 'value' has been read
        int b;
        do
        {
            b = stream.ReadByte();
            if (b >= '0' && b <= '9')
            {
                value = value * 10 + (b - '0');
                inNumber = true;
                continue;
            }

            // Any non-digit byte, and end of file, completes the number in progress.
            if (inNumber)
            {
                switch (fieldIndex)
                {
                    case 0: current.A = value; break;
                    case 1: current.B = value; break;
                    case 2: current.C = value; break;
                }
                fieldIndex++;
                value = 0;
                inNumber = false;
            }

            // End of file is treated like a line break so a last line without terminator is kept.
            if ((b == '\n' || b < 0) && fieldIndex > 0)
            {
                if (rowCount == tempA.Length)
                {
                    throw new InvalidDataException(string.Format(
                        "Input file '{0}' holds more than {1} rows; raise COUNT to read it.",
                        inputFile, tempA.Length));
                }
                tempA[rowCount++] = current;
                current = new Row();
                fieldIndex = 0;
            }
        } while (b >= 0);
    }
    return rowCount;
}
// Generator shared by every Row.RandomRow call. Access is not synchronized.
static Random rand = new Random();

/// <summary>
/// One record of the data set: three integer columns A, B and C. The natural ordering of rows
/// is by column A only.
/// </summary>
public struct Row : IComparable<Row>
{
    public int A;
    public int B;
    public int C;

    /// <summary>
    /// Creates a row whose three columns are each drawn from <c>rand.Next(count)</c>.
    /// </summary>
    /// <param name="count">Exclusive upper bound passed to the generator for every column.</param>
    public static Row RandomRow(int count)
    {
        return new Row { A = rand.Next(count), B = rand.Next(count), C = rand.Next(count) };
    }

    /// <summary>
    /// Orders rows by column A.
    /// </summary>
    /// <returns>A negative value, zero, or a positive value when this row's A is less than,
    /// equal to, or greater than <paramref name="other"/>'s A.</returns>
    public int CompareTo(Row other)
    {
        // Not "A - other.A": that difference wraps around for operands far apart
        // (e.g. int.MaxValue and -1) and then reports the wrong order.
        return A.CompareTo(other.A);
    }

    /// <summary>
    /// Formats the row as its three columns separated by single spaces.
    /// </summary>
    public override string ToString()
    {
        return string.Format("{0} {1} {2}", A, B, C);
    }
}
/// <summary>
/// Builds an array of <paramref name="count"/> rows, each produced by <c>Row.RandomRow(count)</c>.
/// </summary>
/// <param name="count">Number of rows to create; also passed to <c>Row.RandomRow</c>.</param>
/// <returns>The newly created rows.</returns>
public static Row[] GenerateData(int count)
{
    var rows = new Row[count];
    int index = 0;
    while (index < rows.Length)
    {
        rows[index] = Row.RandomRow(count);
        index++;
    }
    return rows;
}
/// <summary>
/// Builds an array of <paramref name="count"/> random rows. Produces exactly what
/// <c>GenerateData</c> produces for the same argument.
/// </summary>
/// <param name="count">Number of rows to create; also passed to <c>Row.RandomRow</c>.</param>
/// <returns>The newly created rows.</returns>
public static Row[] GenerateSplitData(int count)
{
    return GenerateData(count);
}
/// <summary>
/// Console stopwatch scope: <see cref="Create"/> announces the start of a named step and
/// <see cref="Dispose"/> prints how many milliseconds have elapsed since then.
/// </summary>
public class Timer : IDisposable
{
    string label;
    Stopwatch stopwatch;

    /// <summary>
    /// Prints "Started: " followed by <paramref name="message"/> and returns a timer that is already running.
    /// </summary>
    /// <param name="message">Name of the step, repeated in the line printed by <see cref="Dispose"/>.</param>
    public static Timer Create(string message)
    {
        Console.WriteLine(string.Concat("Started: ", message));
        return new Timer
        {
            label = message,
            stopwatch = Stopwatch.StartNew()
        };
    }

    /// <summary>
    /// Prints the step name together with the elapsed time in milliseconds.
    /// </summary>
    public void Dispose()
    {
        long elapsed = stopwatch.ElapsedMilliseconds;
        Console.WriteLine(string.Format("Finished: {0} in {1}ms", label, elapsed));
    }
}
/// <summary>
/// In-place, unstable quicksort for arrays, in a sequential and a parallel flavour, for element
/// types that implement <see cref="IComparable{T}"/> or with a caller-supplied comparison.
/// </summary>
/// <remarks>
/// Ranges are split three ways (less than / equal to / greater than the pivot), so runs of equal
/// keys are finished in a single pass instead of being partitioned again one element at a time.
/// </remarks>
public class ParallelSort
{
    // Ranges spanning fewer than this many index steps are sorted on the calling thread.
    const int SEQUENTIAL_THRESHOLD = 4096;

    #region IComparable<T> overloads

    /// <summary>
    /// Sorts the whole array on the calling thread using the elements' own ordering.
    /// </summary>
    /// <typeparam name="T">Element type.</typeparam>
    /// <param name="arr">Array to sort in place.</param>
    /// <exception cref="ArgumentNullException"><paramref name="arr"/> is null.</exception>
    public static void QuicksortSequential<T>(T[] arr) where T : IComparable<T>
    {
        if (arr == null) throw new ArgumentNullException("arr");
        QuicksortSequentialInPlace(arr, 0, arr.Length - 1);
    }

    /// <summary>
    /// Sorts the whole array using the elements' own ordering, splitting large ranges across
    /// <see cref="Parallel.Invoke(Action[])"/>.
    /// </summary>
    /// <typeparam name="T">Element type.</typeparam>
    /// <param name="arr">Array to sort in place.</param>
    /// <exception cref="ArgumentNullException"><paramref name="arr"/> is null.</exception>
    public static void QuicksortParallel<T>(T[] arr) where T : IComparable<T>
    {
        if (arr == null) throw new ArgumentNullException("arr");
        QuicksortParallelInPlace(arr, 0, arr.Length - 1);
    }

    /// <summary>
    /// Sorts <c>arr[left..right]</c> (both bounds inclusive) on the calling thread.
    /// </summary>
    public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right)
        where T : IComparable<T>
    {
        // Recurse into the smaller side and loop on the larger one: the stack depth stays
        // logarithmic in the range size whatever the input looks like.
        while (right > left)
        {
            int lower, upper;
            Partition(arr, left, right, out lower, out upper);
            if (lower - left < right - upper)
            {
                QuicksortSequentialInPlace(arr, left, lower - 1);
                left = upper + 1;
            }
            else
            {
                QuicksortSequentialInPlace(arr, upper + 1, right);
                right = lower - 1;
            }
        }
    }

    /// <summary>
    /// Sorts <c>arr[left..right]</c> (both bounds inclusive), handling the two sides of each
    /// split through <see cref="Parallel.Invoke(Action[])"/> until a range drops below the
    /// sequential threshold.
    /// </summary>
    public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right)
        where T : IComparable<T>
    {
        if (right <= left) return;
        if (right - left < SEQUENTIAL_THRESHOLD)
        {
            QuicksortSequentialInPlace(arr, left, right);
            return;
        }

        int lower, upper;
        Partition(arr, left, right, out lower, out upper);
        Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, lower - 1),
                        () => QuicksortParallelInPlace(arr, upper + 1, right));
    }

    /// <summary>
    /// Rearranges <c>arr[low..high]</c> around the middle element so that
    /// <c>arr[low..lower-1]</c> is less than the pivot, <c>arr[lower..upper]</c> equals it and
    /// <c>arr[upper+1..high]</c> is greater.
    /// </summary>
    private static void Partition<T>(T[] arr, int low, int high, out int lower, out int upper)
        where T : IComparable<T>
    {
        // low + (high - low) / 2 cannot overflow, unlike (high + low) / 2.
        T pivot = arr[low + (high - low) / 2];
        int lt = low;    // arr[low..lt-1]   <  pivot
        int gt = high;   // arr[gt+1..high]  >  pivot
        int i = low;     // arr[lt..i-1]     == pivot
        while (i <= gt)
        {
            int order = arr[i].CompareTo(pivot);
            if (order < 0)
            {
                Swap(arr, lt, i);
                lt++;
                i++;
            }
            else if (order > 0)
            {
                Swap(arr, i, gt);
                gt--;
            }
            else
            {
                i++;
            }
        }

        // The clamps only take effect when the comparison claims the pivot differs from itself;
        // they keep both remaining sub-ranges strictly smaller so the sort always terminates.
        lower = Math.Min(lt, high);
        upper = Math.Max(gt, low);
    }

    #endregion

    #region Func<T, T, int> comparer overloads

    /// <summary>
    /// Sorts the whole array on the calling thread using <paramref name="comparer"/>.
    /// </summary>
    /// <typeparam name="T">Element type.</typeparam>
    /// <param name="arr">Array to sort in place.</param>
    /// <param name="comparer">Returns a negative value, zero or a positive value when its first
    /// argument sorts before, with, or after its second argument.</param>
    /// <exception cref="ArgumentNullException"><paramref name="arr"/> or <paramref name="comparer"/> is null.</exception>
    public static void QuicksortSequential<T>(T[] arr, Func<T, T, int> comparer)
    {
        if (arr == null) throw new ArgumentNullException("arr");
        if (comparer == null) throw new ArgumentNullException("comparer");
        QuicksortSequentialInPlace(arr, 0, arr.Length - 1, comparer);
    }

    /// <summary>
    /// Sorts the whole array using <paramref name="comparer"/>, splitting large ranges across
    /// <see cref="Parallel.Invoke(Action[])"/>. The comparer may be called from several threads at once.
    /// </summary>
    /// <typeparam name="T">Element type.</typeparam>
    /// <param name="arr">Array to sort in place.</param>
    /// <param name="comparer">Returns a negative value, zero or a positive value when its first
    /// argument sorts before, with, or after its second argument.</param>
    /// <exception cref="ArgumentNullException"><paramref name="arr"/> or <paramref name="comparer"/> is null.</exception>
    public static void QuicksortParallel<T>(T[] arr, Func<T, T, int> comparer)
    {
        if (arr == null) throw new ArgumentNullException("arr");
        if (comparer == null) throw new ArgumentNullException("comparer");
        QuicksortParallelInPlace(arr, 0, arr.Length - 1, comparer);
    }

    /// <summary>
    /// Sorts <c>arr[left..right]</c> (both bounds inclusive) on the calling thread using <paramref name="comparer"/>.
    /// </summary>
    public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
    {
        // Same smaller-side-first scheme as the IComparable<T> overload.
        while (right > left)
        {
            int lower, upper;
            Partition(arr, left, right, comparer, out lower, out upper);
            if (lower - left < right - upper)
            {
                QuicksortSequentialInPlace(arr, left, lower - 1, comparer);
                left = upper + 1;
            }
            else
            {
                QuicksortSequentialInPlace(arr, upper + 1, right, comparer);
                right = lower - 1;
            }
        }
    }

    /// <summary>
    /// Sorts <c>arr[left..right]</c> (both bounds inclusive) using <paramref name="comparer"/>,
    /// handling the two sides of each split through <see cref="Parallel.Invoke(Action[])"/> until
    /// a range drops below the sequential threshold.
    /// </summary>
    public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
    {
        if (right <= left) return;
        if (right - left < SEQUENTIAL_THRESHOLD)
        {
            QuicksortSequentialInPlace(arr, left, right, comparer);
            return;
        }

        int lower, upper;
        Partition(arr, left, right, comparer, out lower, out upper);
        Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, lower - 1, comparer),
                        () => QuicksortParallelInPlace(arr, upper + 1, right, comparer));
    }

    /// <summary>
    /// Comparer-based twin of the three-way partition: on return <c>arr[low..lower-1]</c> sorts
    /// before the pivot, <c>arr[lower..upper]</c> compares equal to it and
    /// <c>arr[upper+1..high]</c> sorts after it.
    /// </summary>
    private static void Partition<T>(T[] arr, int low, int high, Func<T, T, int> comparer,
        out int lower, out int upper)
    {
        // low + (high - low) / 2 cannot overflow, unlike (high + low) / 2.
        T pivot = arr[low + (high - low) / 2];
        int lt = low;    // arr[low..lt-1]   <  pivot
        int gt = high;   // arr[gt+1..high]  >  pivot
        int i = low;     // arr[lt..i-1]     == pivot
        while (i <= gt)
        {
            int order = comparer(arr[i], pivot);
            if (order < 0)
            {
                Swap(arr, lt, i);
                lt++;
                i++;
            }
            else if (order > 0)
            {
                Swap(arr, i, gt);
                gt--;
            }
            else
            {
                i++;
            }
        }

        // The clamps only take effect when the comparer claims the pivot differs from itself;
        // they keep both remaining sub-ranges strictly smaller so the sort always terminates.
        lower = Math.Min(lt, high);
        upper = Math.Max(gt, low);
    }

    #endregion

    /// <summary>
    /// Exchanges the elements at positions <paramref name="i"/> and <paramref name="j"/>.
    /// </summary>
    private static void Swap<T>(T[] arr, int i, int j)
    {
        T tmp = arr[i];
        arr[i] = arr[j];
        arr[j] = tmp;
    }
}
}
}