"""将source post对应的posts划分成不定长的post batch序列
Params:
weibo_id (str), source post对应的id,用于读取对应数据
N (int), 时间序列的基准time steps个数
Returns:
output (list), interval list, 每一个interval包含一定数量的post index
"""
# The number of posts need not be the same in every time interval.
path = "Weibo" + os_sep + "{}.json".format(weibo_id)
data = load_rawdata(data_path + path) # 基于weibo id加载包含转帖文本及时间戳的原始数据
tweet_list = [(idx, tweet["t"]) for idx, tweet in enumerate(data)]
total_timespan = tweet_list[-1][1] - tweet_list[0][1] # L(i)
time_interval = total_timespan / N # l
k = 0
pre_max_inters = [] # U_(k_1)
while True:
# Split the series using the current time interval
k += 1
interval_num = int(total_timespan / time_interval)
output, inter_index = ConstructSeries(tweet_list, interval_num, time_interval)
max_inters = GetContinueInterval(inter_index) # maximum continue interval index
if len(pre_max_inters) < len(max_inters) < N:
time_interval = int(time_interval * 0.5) # Shorten the intervals
pre_max_inters = max_inters
if time_interval == 0:
output = output[max_inters[0]:max_inters[-1] + 1]
break
else:
output = output[max_inters[0]:max_inters[-1] + 1]
break
return output
# 原创文章,作者:506227337,如若转载,请注明出处:https://blog.ytso.com/tech/pnotes/273172.html