海量数据算法:如何从超过10G的记录IP地址的日志中,较快的找出登录次数最多的一个IP?
关注者
661被浏览
78,957登录后你可以
不限量看优质回答私信答主深度交流精彩内容一键收藏
下面这个程序运行时间是几十秒。
练习:找出 worse case,让运行时间长达几分钟甚至十几分钟,然后提出并实施改进措施。
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <vector>
static_assert(sizeof(void*) == 8, "64-bit only.");
std::vector<uint8_t> counts_;
struct IPcount
{
uint32_t ip;
uint32_t count;
bool operator<(IPcount rhs) const
{ return ip < rhs.ip; }
};
std::vector<IPcount> overflows_;
IPcount top;
void addOverflow(uint32_t ip)
{
IPcount newItem = { ip, 256 };
auto it = std::lower_bound(overflows_.begin(), overflows_.end(), newItem);
if (it != overflows_.end() && it->ip == ip)
{
it->count++;
assert(it->count != 0 && "you need larger count variable");
newItem.count = it->count;
}
else
{
overflows_.insert(it, newItem);
}
if (newItem.count > top.count)
top = newItem;
}
void add(uint32_t ip)
{
if (counts_[ip] == 255)
{
addOverflow(ip);
}
else
{
counts_[ip]++;
if (counts_[ip] > top.count)
{
top.ip = ip;
top.count = counts_[ip];
}
}
}
uint32_t getMostFrequenntIP()
{
return top.ip;
}
int main()
{
assert(counts_.max_size() > 0xFFFFFFFFUL);
counts_.resize(1L << 32);
printf("%zd\n", counts_.size());
add(0x1);
add(0x2);
add(0x2);
for (size_t i = 0; i < (1L << 32)-1; ++i)
add(0x3);
printf("%08x %u\n", top.ip, top.count);
}