大数据Spark “蘑菇云”行动第106课:Hive源码大师之路第四步:Hive中GroupBy和各种类型Join源码剖析
CREATE TABLE ... SKEWED BY (key) ON (...)
set hive.optimize.skewjoin.compiletime=true;
hive.skewjoin.key
/**
 * Process the row.
 *
 * @param row
 *          The object representing the row.
 * @param tag
 *          The tag of the row usually means which parent this row comes from.
 *          Rows with the same tag should have exactly the same rowInspector
 *          all the time.
 * @throws HiveException
 *           declared by this abstract signature; the conditions under which
 *           it is raised are defined by the implementing class.
 */
public abstract void process(Object row, int tag) throws HiveException;
set hive.groupby.skewindata=true;
// Are we consuming too much memory?
// Excerpt from a join operator's row-processing path (the enclosing method
// header is not part of this excerpt). The early-emit branch below is taken
// only when all of the following hold:
//   - the current row belongs to the last alias (alias == numAliases - 1);
//   - it is NOT the case that handleSkewJoin is set and
//     skewJoinKeyContext.currBigKeyTag >= 0
//     (NOTE(review): presumably "a skewed big key is currently being
//     handled" - confirm against the skew-join handler);
//   - hasLeftSemiJoin is false.
if (alias == numAliases - 1 && !(handleSkewJoin && skewJoinKeyContext.currBigKeyTag >= 0) &&
!hasLeftSemiJoin) {
// Emit only when sz equals joinEmitInterval exactly and hasFilter(alias)
// is false. NOTE(review): sz is defined outside this excerpt; presumably
// the number of rows buffered so far for this alias - confirm.
if (sz == joinEmitInterval && !hasFilter(alias)) {
// The input is sorted by alias, so if we are already in the last join
// operand, we can emit some results now.
// Note this has to be done before adding the current row to the
// storage, to preserve the correctness for outer joins.
checkAndGenObject();
// Drop the rows buffered for this alias once results have been generated.
storage[alias].clearRows();
}
} else {
// Progress logging: fires only when info logging is enabled and sz has
// reached the current reporting threshold nextSz.
if (isLogInfoEnabled && (sz == nextSz)) {
// Print a message if we reached at least 1000 rows for a join operand.
// We won't print a message for the last join operand since its size
// will never reach joinEmitInterval.
LOG.info("table " + alias + " has " + sz + " rows for join key " + keyObject);
// Advance the threshold at which the next message will be printed.
nextSz = getNextSize(nextSz);
}
}